mirror of
https://github.com/tencentmusic/supersonic.git
synced 2026-06-26 06:39:20 +08:00
fix(mapper): 优先按相似度排序并保留所有完全匹配项
This commit is contained in:
@@ -9,6 +9,7 @@ import org.springframework.beans.factory.annotation.Autowired;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
@@ -50,8 +51,11 @@ public class HanlpDictMatchStrategy extends SingleMatchStrategy<HanlpMapResult>
|
|||||||
return new ArrayList<>();
|
return new ArrayList<>();
|
||||||
}
|
}
|
||||||
// step3. merge pre/suffix result
|
// step3. merge pre/suffix result
|
||||||
|
// sort by similarity (desc) first, then name length (desc), so that
|
||||||
|
// higher-similarity records are inserted first and survive LinkedHashSet dedup
|
||||||
hanlpMapResults = hanlpMapResults.stream()
|
hanlpMapResults = hanlpMapResults.stream()
|
||||||
.sorted((a, b) -> -(b.getName().length() - a.getName().length()))
|
.sorted(Comparator.comparingDouble(HanlpMapResult::getSimilarity).reversed()
|
||||||
|
.thenComparing((a, b) -> Integer.compare(b.getName().length(), a.getName().length())))
|
||||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
.collect(Collectors.toCollection(LinkedHashSet::new));
|
||||||
|
|
||||||
// step4. filter by similarity
|
// step4. filter by similarity
|
||||||
|
|||||||
@@ -123,15 +123,9 @@ public class MapFilter {
|
|||||||
.filter(SchemaElementMatch::isFullMatched).collect(Collectors.toList());
|
.filter(SchemaElementMatch::isFullMatched).collect(Collectors.toList());
|
||||||
|
|
||||||
if (!fullMatches.isEmpty()) {
|
if (!fullMatches.isEmpty()) {
|
||||||
// If there are objects with similarity=1.0, choose the one with the longest
|
// Keep all records with similarity=1.0, as they may correspond to different
|
||||||
// detectWord and smallest offset
|
// elementIds with the same detectWord
|
||||||
SchemaElementMatch bestMatch = fullMatches.stream()
|
result.addAll(fullMatches);
|
||||||
.max(Comparator.comparing(
|
|
||||||
(SchemaElementMatch match) -> match.getDetectWord().length()))
|
|
||||||
.orElse(null);
|
|
||||||
if (bestMatch != null) {
|
|
||||||
result.add(bestMatch);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// If there are no objects with similarity=1.0, keep all objects with similarity<1.0
|
// If there are no objects with similarity=1.0, keep all objects with similarity<1.0
|
||||||
result.addAll(group);
|
result.addAll(group);
|
||||||
|
|||||||
Reference in New Issue
Block a user