(improvement)(Headless) Adjust file paths and fix the issue of inserting duplicate data in embeddings. (#918)

This commit is contained in:
lexluo09
2024-04-18 10:38:18 +08:00
committed by GitHub
parent ee798b7671
commit f9c60d0c65
2 changed files with 17 additions and 14 deletions

View File

@@ -1,10 +1,5 @@
package com.tencent.supersonic.common.util.embedding;
import static dev.langchain4j.internal.Utils.randomUUID;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static java.util.Comparator.comparingDouble;
import com.tencent.supersonic.common.config.EmbeddingConfig;
import com.tencent.supersonic.common.util.ContextUtils;
import dev.langchain4j.data.embedding.Embedding;
@@ -13,6 +8,10 @@ import dev.langchain4j.store.embedding.CosineSimilarity;
import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.RelevanceScore;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -20,6 +19,7 @@ import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -29,9 +29,11 @@ import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import static dev.langchain4j.internal.Utils.randomUUID;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static java.util.Comparator.comparingDouble;
/***
* Implementation of S2EmbeddingStore that keeps its data in the Java process's memory.
@@ -134,7 +136,8 @@ public class InMemoryS2EmbeddingStore implements S2EmbeddingStore {
retrieval.setDistance(1 - embeddingMatch.score());
retrieval.setId(embeddingMatch.embeddingId());
retrieval.setQuery(embeddingMatch.embedded().getQuery());
Map<String, Object> metadata = embeddingMatch.embedded().getMetadata();
Map<String, Object> metadata = new HashMap<>();
metadata.putAll(embeddingMatch.embedded().getMetadata());
if (filterRetrieval(filterCondition, metadata)) {
continue;
}
@@ -179,9 +182,9 @@ public class InMemoryS2EmbeddingStore implements S2EmbeddingStore {
* Uses a brute force approach by iterating over all embeddings to find the best matches.
*
* @param <Embedded> The class of the object that has been embedded.
* Typically, it is {@link dev.langchain4j.data.segment.TextSegment}.
* Copied from dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore,
* with a fix for ConcurrentModificationException in a multi-threaded environment.
* Typically, it is {@link dev.langchain4j.data.segment.TextSegment}.
* Copied from dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore,
* with a fix for ConcurrentModificationException in a multi-threaded environment.
*/
public static class InMemoryEmbeddingStore<Embedded> implements EmbeddingStore<Embedded> {
@@ -267,7 +270,7 @@ public class InMemoryS2EmbeddingStore implements S2EmbeddingStore {
@Override
public List<EmbeddingMatch<Embedded>> findRelevant(Embedding referenceEmbedding, int maxResults,
double minScore) {
double minScore) {
Comparator<EmbeddingMatch<Embedded>> comparator = comparingDouble(EmbeddingMatch::score);
PriorityQueue<EmbeddingMatch<Embedded>> matches = new PriorityQueue<>(comparator);

View File

@@ -1,4 +1,4 @@
package com.tencent.supersonic;
package com.tencent.supersonic.headless.server.listener;
import com.tencent.supersonic.common.config.EmbeddingConfig;
import com.tencent.supersonic.headless.core.chat.parser.JavaLLMProxy;