From f9c60d0c65967bd35549418a67c4582a808085e5 Mon Sep 17 00:00:00 2001 From: lexluo09 <39718951+lexluo09@users.noreply.github.com> Date: Thu, 18 Apr 2024 10:38:18 +0800 Subject: [PATCH] (improvement)(Headless) Adjust file paths and fix the issue of inserting duplicate data in embeddings. (#918) --- .../embedding/InMemoryS2EmbeddingStore.java | 29 ++++++++++--------- .../listener}/EmbeddingInitListener.java | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) rename {launchers/standalone/src/main/java/com/tencent/supersonic => headless/server/src/main/java/com/tencent/supersonic/headless/server/listener}/EmbeddingInitListener.java (96%) diff --git a/common/src/main/java/com/tencent/supersonic/common/util/embedding/InMemoryS2EmbeddingStore.java b/common/src/main/java/com/tencent/supersonic/common/util/embedding/InMemoryS2EmbeddingStore.java index 6c063e19d..6acfe7805 100644 --- a/common/src/main/java/com/tencent/supersonic/common/util/embedding/InMemoryS2EmbeddingStore.java +++ b/common/src/main/java/com/tencent/supersonic/common/util/embedding/InMemoryS2EmbeddingStore.java @@ -1,10 +1,5 @@ package com.tencent.supersonic.common.util.embedding; -import static dev.langchain4j.internal.Utils.randomUUID; -import static java.nio.file.StandardOpenOption.CREATE; -import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; -import static java.util.Comparator.comparingDouble; - import com.tencent.supersonic.common.config.EmbeddingConfig; import com.tencent.supersonic.common.util.ContextUtils; import dev.langchain4j.data.embedding.Embedding; @@ -13,6 +8,10 @@ import dev.langchain4j.store.embedding.CosineSimilarity; import dev.langchain4j.store.embedding.EmbeddingMatch; import dev.langchain4j.store.embedding.EmbeddingStore; import dev.langchain4j.store.embedding.RelevanceScore; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.collections4.MapUtils; +import org.apache.commons.lang3.StringUtils; + import java.io.IOException; import java.nio.file.Files; 
import java.nio.file.Path; @@ -20,6 +19,7 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -29,9 +29,11 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArraySet; import java.util.stream.Collectors; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections4.MapUtils; -import org.apache.commons.lang3.StringUtils; + +import static dev.langchain4j.internal.Utils.randomUUID; +import static java.nio.file.StandardOpenOption.CREATE; +import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; +import static java.util.Comparator.comparingDouble; /*** * Implementation of S2EmbeddingStore within the Java process's in-memory. @@ -134,7 +136,8 @@ public class InMemoryS2EmbeddingStore implements S2EmbeddingStore { retrieval.setDistance(1 - embeddingMatch.score()); retrieval.setId(embeddingMatch.embeddingId()); retrieval.setQuery(embeddingMatch.embedded().getQuery()); - Map<String, Object> metadata = embeddingMatch.embedded().getMetadata(); + Map<String, Object> metadata = new HashMap<>(); + metadata.putAll(embeddingMatch.embedded().getMetadata()); if (filterRetrieval(filterCondition, metadata)) { continue; } @@ -179,9 +182,9 @@ public class InMemoryS2EmbeddingStore implements S2EmbeddingStore { * Uses a brute force approach by iterating over all embeddings to find the best matches. * * @param <Embedded> The class of the object that has been embedded. - * Typically, it is {@link dev.langchain4j.data.segment.TextSegment}. - * copy from dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore - * and fix concurrentModificationException in a multi-threaded environment + * Typically, it is {@link dev.langchain4j.data.segment.TextSegment}. 
+ * copy from dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore + * and fix concurrentModificationException in a multi-threaded environment */ public static class InMemoryEmbeddingStore<Embedded> implements EmbeddingStore<Embedded> { @@ -267,7 +270,7 @@ public class InMemoryS2EmbeddingStore implements S2EmbeddingStore { @Override public List<EmbeddingMatch<Embedded>> findRelevant(Embedding referenceEmbedding, int maxResults, - double minScore) { + double minScore) { Comparator<EmbeddingMatch<Embedded>> comparator = comparingDouble(EmbeddingMatch::score); PriorityQueue<EmbeddingMatch<Embedded>> matches = new PriorityQueue<>(comparator); diff --git a/launchers/standalone/src/main/java/com/tencent/supersonic/EmbeddingInitListener.java b/headless/server/src/main/java/com/tencent/supersonic/headless/server/listener/EmbeddingInitListener.java similarity index 96% rename from launchers/standalone/src/main/java/com/tencent/supersonic/EmbeddingInitListener.java rename to headless/server/src/main/java/com/tencent/supersonic/headless/server/listener/EmbeddingInitListener.java index 2b3b5715f..20e50e7e6 100644 --- a/launchers/standalone/src/main/java/com/tencent/supersonic/EmbeddingInitListener.java +++ b/headless/server/src/main/java/com/tencent/supersonic/headless/server/listener/EmbeddingInitListener.java @@ -1,4 +1,4 @@ -package com.tencent.supersonic; +package com.tencent.supersonic.headless.server.listener; import com.tencent.supersonic.common.config.EmbeddingConfig; import com.tencent.supersonic.headless.core.chat.parser.JavaLLMProxy;