/*
 * Recovered from git patch c27f1d13be69270c9bd8c4e134f646842bca094e
 * From: QJ_wonder <38885395+BigdataQIJI@users.noreply.github.com>
 * Date: Wed, 21 May 2025 11:19:32 +0800
 * Subject: (feature)(headless) Add a cosine-similarity calculation utility class
 *
 * Adds a cosine-similarity method that segments text with jieba and then computes the
 * cosine similarity of the two texts.
 *
 * Target path:
 * headless/chat/src/main/java/com/tencent/supersonic/headless/chat/parser/llm/TextSimilarityCalculation.java
 */
package com.tencent.supersonic.headless.chat.parser.llm;

import com.huaban.analysis.jieba.JiebaSegmenter;
import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Scores the similarity of two texts as the cosine of the angle between their
 * term-frequency vectors, using jieba to split each text into words.
 */
@Slf4j
public class TextSimilarityCalculation {

    /**
     * Builds the term-frequency vector of {@code words} over {@code vocabulary}.
     *
     * @param words tokens of one text; repeated tokens are counted once per occurrence
     * @param vocabulary ordered list of tokens; position {@code i} defines dimension {@code i}
     * @return an array of {@code vocabulary.size()} entries where entry {@code i} is the number
     *         of times {@code vocabulary.get(i)} occurs in {@code words}
     */
    private static double[] createVector(List<String> words, List<String> vocabulary) {
        Map<String, Integer> wordFreq = new HashMap<>();
        for (String word : words) {
            wordFreq.merge(word, 1, Integer::sum);
        }
        double[] vector = new double[vocabulary.size()];
        for (int i = 0; i < vector.length; i++) {
            vector[i] = wordFreq.getOrDefault(vocabulary.get(i), 0);
        }
        return vector;
    }

    /**
     * Computes the cosine similarity {@code (A . B) / (|A| * |B|)} of two vectors.
     *
     * @param vecA first vector
     * @param vecB second vector; must have at least {@code vecA.length} entries
     * @return the cosine similarity, or {@code 0.0} when either vector has zero length
     *         (the formula would otherwise evaluate to {@code NaN})
     */
    private static double cosineSimilarity(double[] vecA, double[] vecB) {
        double dotProduct = 0.0;
        double normA = 0.0;
        double normB = 0.0;
        for (int i = 0; i < vecA.length; i++) {
            dotProduct += vecA[i] * vecB[i];
            normA += vecA[i] * vecA[i];
            normB += vecB[i] * vecB[i];
        }
        // An all-zero vector has no direction; treat it as "no similarity" instead of 0/0.
        if (normA == 0.0 || normB == 0.0) {
            return 0.0;
        }
        // One square root over the product keeps identical vectors at exactly 1.0, whereas
        // sqrt(normA) * sqrt(normB) can round to a value slightly different from the dot product.
        return dotProduct / Math.sqrt(normA * normB);
    }

    /**
     * Computes the cosine similarity between a query text and a dataset name.
     *
     * <p>Both strings are segmented with {@link JiebaSegmenter#sentenceProcess(String)}, the
     * union of their distinct tokens forms the vocabulary, and the two term-frequency vectors
     * over that vocabulary are compared.
     *
     * @param queryText the query text; may be {@code null}
     * @param datasetName the dataset name; may be {@code null}
     * @return the cosine similarity of the two texts; {@code 0.0} when either argument is
     *         {@code null} or when either text produces no tokens
     */
    public static double getDataSetSimilarity(String queryText, String datasetName) {
        if (queryText == null || datasetName == null) {
            return 0.0;
        }
        JiebaSegmenter segmenter = new JiebaSegmenter();

        // 1. Segment both texts into words.
        List<String> words1 = segmenter.sentenceProcess(queryText);
        List<String> words2 = segmenter.sentenceProcess(datasetName);

        // 2. Build the vocabulary as a set union so a word shared by both texts gets exactly
        //    one dimension. Appending the second text's distinct words to the first text's
        //    would list shared words twice and inflate the score. LinkedHashSet keeps a
        //    stable, first-seen order.
        Set<String> distinctWords = new LinkedHashSet<>(words1);
        distinctWords.addAll(words2);
        List<String> vocabulary = new ArrayList<>(distinctWords);

        double[] vector1 = createVector(words1, vocabulary);
        double[] vector2 = createVector(words2, vocabulary);

        // 3. Compare the two term-frequency vectors by cosine similarity.
        return cosineSimilarity(vector1, vector2);
    }
}