[improvement][headless]Refactor headless infra to support advanced semantic modelling.

This commit is contained in:
jerryjzhang
2024-12-04 14:40:30 +08:00
parent 9e24fd04a5
commit 28d5f38ffb
44 changed files with 64 additions and 5192 deletions

View File

@@ -19,19 +19,6 @@ public class Term {
this.nature = nature;
}
/**
 * Creates a term from a word, its nature and an offset.
 *
 * @param word the term text; {@code length()} reports the length of this string
 * @param nature the nature assigned to this term
 * @param offset value stored in the {@code offset} field
 *        (NOTE(review): presumably the term's position in the analysed text; confirm)
 */
public Term(String word, Nature nature, int offset) {
this.word = word;
this.nature = nature;
this.offset = offset;
}
/**
 * Creates a term from a word, its nature, an offset and a frequency.
 *
 * @param word the term text
 * @param nature the nature assigned to this term
 * @param offset value stored in the {@code offset} field
 * @param frequency value stored in the {@code frequency} field
 */
public Term(String word, Nature nature, int offset, int frequency) {
    // The three-argument constructor already populates word, nature and offset.
    this(word, nature, offset);
    this.frequency = frequency;
}
/**
 * Returns the number of characters in this term's word.
 *
 * @return {@code word.length()}
 */
public int length() {
    return word.length();
}

View File

@@ -8,7 +8,6 @@ import lombok.Data;
@Builder
public class DataItem {
/** This field uses an underscore (_) at the end. */
private String id;
private String bizName;
@@ -19,9 +18,10 @@ public class DataItem {
private TypeEnums type;
/** This field uses an underscore (_) at the end. */
private String modelId;
private String domainId;
private String defaultAgg;
public String getNewName() {

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.pojo;
import com.google.common.collect.Lists;
import lombok.Data;
import java.util.List;
@@ -18,5 +19,5 @@ public class ModelRela extends RecordInfo {
// left join, inner join, right join, outer join
private String joinType;
private List<JoinCondition> joinConditions;
private List<JoinCondition> joinConditions = Lists.newArrayList();
}

View File

@@ -1,6 +1,7 @@
package dev.langchain4j.store.embedding;
import com.alibaba.fastjson.JSONObject;
import com.tencent.supersonic.common.pojo.Constants;
import com.tencent.supersonic.common.pojo.DataItem;
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.segment.TextSegment;
@@ -17,10 +18,18 @@ public class TextSegmentConvert {
public static final String QUERY_ID = "queryId";
public static List<TextSegment> convertToEmbedding(List<DataItem> dataItems) {
return dataItems.stream().map(dataItem -> {
Map meta = JSONObject.parseObject(JSONObject.toJSONString(dataItem), Map.class);
TextSegment textSegment = TextSegment.from(dataItem.getName(), new Metadata(meta));
addQueryId(textSegment, dataItem.getId() + dataItem.getType().name().toLowerCase());
return dataItems.stream().map(item -> {
// suffix with underscore to avoid embedding issue
DataItem newItem = DataItem.builder().domainId(item.getDomainId())
.bizName(item.getBizName()).type(item.getType()).newName(item.getNewName())
.defaultAgg(item.getDefaultAgg()).name(item.getName())
.id(item.getId() + Constants.UNDERLINE)
.modelId(item.getModelId() + Constants.UNDERLINE)
.domainId(item.getDomainId() + Constants.UNDERLINE).build();
Map meta = JSONObject.parseObject(JSONObject.toJSONString(newItem), Map.class);
TextSegment textSegment = TextSegment.from(newItem.getName(), new Metadata(meta));
addQueryId(textSegment, newItem.getId() + newItem.getType().name().toLowerCase());
return textSegment;
}).collect(Collectors.toList());
}