[improvement](project) Adding tag abstraction to the dictionary and optimizing related code. (#785)

This commit is contained in:
lexluo09
2024-03-05 18:00:48 +08:00
committed by GitHub
parent ba83c6ca81
commit b8ecfc6a99
27 changed files with 335 additions and 261 deletions

View File

@@ -35,4 +35,5 @@ public abstract class BaseWordBuilder {
}
protected abstract List<DictWord> doGet(String word, SchemaElement schemaElement);
}

View File

@@ -0,0 +1,25 @@
package com.tencent.supersonic.headless.core.knowledge.builder;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import java.util.ArrayList;
import java.util.List;
import org.springframework.util.CollectionUtils;
public abstract class BaseWordWithAliasBuilder extends BaseWordBuilder {
public abstract DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix);
public List<DictWord> getOneWordNatureAlias(SchemaElement schemaElement, boolean isSuffix) {
List<DictWord> dictWords = new ArrayList<>();
if (CollectionUtils.isEmpty(schemaElement.getAlias())) {
return dictWords;
}
for (String alias : schemaElement.getAlias()) {
dictWords.add(getOneWordNature(alias, schemaElement, isSuffix));
}
return dictWords;
}
}

View File

@@ -1,41 +1,32 @@
package com.tencent.supersonic.headless.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
/**
* dimension word nature
*/
@Service
public class DimensionWordBuilder extends BaseWordBuilder {
@Value("${nlp.dimension.use.suffix:true}")
private boolean nlpDimensionUseSuffix = true;
public class DimensionWordBuilder extends BaseWordWithAliasBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
result.add(getOnwWordNature(word, schemaElement, false));
result.addAll(getOnwWordNatureAlias(schemaElement, false));
if (nlpDimensionUseSuffix) {
String reverseWord = StringUtils.reverse(word);
if (StringUtils.isNotEmpty(word) && !word.equalsIgnoreCase(reverseWord)) {
result.add(getOnwWordNature(reverseWord, schemaElement, true));
}
result.add(getOneWordNature(word, schemaElement, false));
result.addAll(getOneWordNatureAlias(schemaElement, false));
String reverseWord = StringUtils.reverse(word);
if (StringUtils.isNotEmpty(word) && !word.equalsIgnoreCase(reverseWord)) {
result.add(getOneWordNature(reverseWord, schemaElement, true));
}
return result;
}
private DictWord getOnwWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long modelId = schemaElement.getModel();
@@ -49,16 +40,4 @@ public class DimensionWordBuilder extends BaseWordBuilder {
return dictWord;
}
private List<DictWord> getOnwWordNatureAlias(SchemaElement schemaElement, boolean isSuffix) {
List<DictWord> dictWords = new ArrayList<>();
if (CollectionUtils.isEmpty(schemaElement.getAlias())) {
return dictWords;
}
for (String alias : schemaElement.getAlias()) {
dictWords.add(getOnwWordNature(alias, schemaElement, false));
}
return dictWords;
}
}

View File

@@ -2,44 +2,38 @@ package com.tencent.supersonic.headless.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.Objects;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* dimension value wordNature
*/
@Service
@Slf4j
public class EntityWordBuilder extends BaseWordBuilder {
public class EntityWordBuilder extends BaseWordWithAliasBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
if (Objects.isNull(schemaElement)) {
return result;
}
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.ENTITY.getType();
if (!CollectionUtils.isEmpty(schemaElement.getAlias())) {
schemaElement.getAlias().stream().forEach(alias -> {
DictWord dictWordAlias = new DictWord();
dictWordAlias.setWord(alias);
dictWordAlias.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY * 2, nature));
result.add(dictWordAlias);
});
}
result.add(getOneWordNature(word, schemaElement, false));
result.addAll(getOneWordNatureAlias(schemaElement, false));
return result;
}
@Override
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
String nature = DictWordType.NATURE_SPILT + schemaElement.getModel()
+ DictWordType.NATURE_SPILT + schemaElement.getId() + DictWordType.ENTITY.getType();
DictWord dictWord = new DictWord();
dictWord.setWord(word);
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY * 2, nature));
return dictWord;
}
}

View File

@@ -1,41 +1,32 @@
package com.tencent.supersonic.headless.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
/**
* Metric DictWord
*/
@Service
public class MetricWordBuilder extends BaseWordBuilder {
@Value("${nlp.metric.use.suffix:true}")
private boolean nlpMetricUseSuffix = true;
public class MetricWordBuilder extends BaseWordWithAliasBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
result.add(getOnwWordNature(word, schemaElement, false));
result.addAll(getOnwWordNatureAlias(schemaElement, false));
if (nlpMetricUseSuffix) {
String reverseWord = StringUtils.reverse(word);
if (!word.equalsIgnoreCase(reverseWord)) {
result.add(getOnwWordNature(reverseWord, schemaElement, true));
}
result.add(getOneWordNature(word, schemaElement, false));
result.addAll(getOneWordNatureAlias(schemaElement, false));
String reverseWord = StringUtils.reverse(word);
if (!word.equalsIgnoreCase(reverseWord)) {
result.add(getOneWordNature(reverseWord, schemaElement, true));
}
return result;
}
private DictWord getOnwWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long modelId = schemaElement.getModel();
@@ -49,16 +40,4 @@ public class MetricWordBuilder extends BaseWordBuilder {
return dictWord;
}
private List<DictWord> getOnwWordNatureAlias(SchemaElement schemaElement, boolean isSuffix) {
List<DictWord> dictWords = new ArrayList<>();
if (CollectionUtils.isEmpty(schemaElement.getAlias())) {
return dictWords;
}
for (String alias : schemaElement.getAlias()) {
dictWords.add(getOnwWordNature(alias, schemaElement, false));
}
return dictWords;
}
}

View File

@@ -1,42 +1,36 @@
package com.tencent.supersonic.headless.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.Objects;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* model word nature
*/
@Service
@Slf4j
public class ModelWordBuilder extends BaseWordBuilder {
public class ModelWordBuilder extends BaseWordWithAliasBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
//modelName
DictWord dictWord = buildDictWord(word, schemaElement.getDataSet());
result.add(dictWord);
//alias
List<String> aliasList = schemaElement.getAlias();
if (CollectionUtils.isNotEmpty(aliasList)) {
for (String alias : aliasList) {
result.add(buildDictWord(alias, schemaElement.getDataSet()));
}
if (Objects.isNull(schemaElement)) {
return result;
}
result.add(getOneWordNature(word, schemaElement, false));
result.addAll(getOneWordNatureAlias(schemaElement, false));
return result;
}
private DictWord buildDictWord(String word, Long modelId) {
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
String nature = DictWordType.NATURE_SPILT + modelId;
String nature = DictWordType.NATURE_SPILT + schemaElement.getDataSet();
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
return dictWord;
}

View File

@@ -0,0 +1,40 @@
package com.tencent.supersonic.headless.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
@Service
public class TagWordBuilder extends BaseWordWithAliasBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
result.add(getOneWordNature(word, schemaElement, false));
result.addAll(getOneWordNatureAlias(schemaElement, false));
String reverseWord = StringUtils.reverse(word);
if (!word.equalsIgnoreCase(reverseWord)) {
result.add(getOneWordNature(reverseWord, schemaElement, true));
}
return result;
}
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
dictWord.setWord(word);
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.TAG.getType();
if (isSuffix) {
nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId()
+ DictWordType.SUFFIX.getType() + DictWordType.TAG.getType();
}
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
return dictWord;
}
}

View File

@@ -2,40 +2,34 @@ package com.tencent.supersonic.headless.core.knowledge.builder;
import com.google.common.collect.Lists;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import com.tencent.supersonic.headless.api.pojo.SchemaElement;
import com.tencent.supersonic.headless.core.knowledge.DictWord;
import com.tencent.supersonic.common.pojo.enums.DictWordType;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.Objects;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
/**
* dimension value wordNature
*/
@Service
@Slf4j
public class ValueWordBuilder extends BaseWordBuilder {
public class ValueWordBuilder extends BaseWordWithAliasBuilder {
@Override
public List<DictWord> doGet(String word, SchemaElement schemaElement) {
List<DictWord> result = Lists.newArrayList();
if (Objects.nonNull(schemaElement) && !CollectionUtils.isEmpty(schemaElement.getAlias())) {
schemaElement.getAlias().stream().forEach(value -> {
DictWord dictWord = new DictWord();
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId();
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
dictWord.setWord(value);
result.add(dictWord);
});
if (Objects.nonNull(schemaElement)) {
result.addAll(getOneWordNatureAlias(schemaElement, false));
}
log.debug("ValueWordBuilder, result:{}", result);
return result;
}
public DictWord getOneWordNature(String word, SchemaElement schemaElement, boolean isSuffix) {
DictWord dictWord = new DictWord();
Long modelId = schemaElement.getModel();
String nature = DictWordType.NATURE_SPILT + modelId + DictWordType.NATURE_SPILT + schemaElement.getId();
dictWord.setNatureWithFrequency(String.format("%s " + DEFAULT_FREQUENCY, nature));
dictWord.setWord(word);
return dictWord;
}
}

View File

@@ -18,6 +18,7 @@ public class WordBuilderFactory {
wordNatures.put(DictWordType.VIEW, new ModelWordBuilder());
wordNatures.put(DictWordType.ENTITY, new EntityWordBuilder());
wordNatures.put(DictWordType.VALUE, new ValueWordBuilder());
wordNatures.put(DictWordType.TAG, new TagWordBuilder());
}
public static BaseWordBuilder get(DictWordType strategyType) {