[improvement][chat] Add threshold judgment to field replacement (#1850)

This commit is contained in:
lexluo09
2024-10-28 13:57:35 +08:00
committed by GitHub
parent c07b64d33c
commit 920c6e2846
9 changed files with 132 additions and 78 deletions

View File

@@ -0,0 +1,37 @@
package com.tencent.supersonic.common.jsqlparser;
/**
 * String-similarity helpers built on an edit distance that, besides insertions, deletions
 * and substitutions, also counts the swap of two adjacent characters as a single edit.
 */
public class EditDistanceUtils {

    /**
     * Returns a normalized similarity score for two strings: {@code 1.0} when they are
     * identical and progressively lower as more edits are needed to turn one into the other.
     * The score is {@code 1 - editDistance / max(length)}.
     *
     * @param word1 first string, must not be {@code null}
     * @param word2 second string, must not be {@code null}
     * @return the similarity score; {@code 1.0} when both strings are empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public static double getSimilarity(String word1, String word2) {
        int longest = Math.max(word2.length(), word1.length());
        if (longest == 0) {
            // Two empty strings are identical. Without this guard the formula below
            // evaluates 0.0 / 0, which is NaN and poisons comparisons and thresholds.
            return 1.0;
        }
        return 1 - (double) editDistance(word1, word2) / longest;
    }

    /**
     * Computes the number of single-character edits (insert, delete, substitute, or swap of
     * two adjacent characters) needed to transform {@code word1} into {@code word2}.
     *
     * @param word1 source string, must not be {@code null}
     * @param word2 target string, must not be {@code null}
     * @return the edit distance between the two strings
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int editDistance(String word1, String word2) {
        final int m = word1.length();
        final int n = word2.length();
        // dp[i][j] = distance between the first i chars of word1 and the first j chars of word2.
        int[][] dp = new int[m + 1][n + 1];
        // Turning an empty prefix into j characters takes j insertions.
        for (int j = 0; j <= n; ++j) {
            dp[0][j] = j;
        }
        // Turning i characters into an empty prefix takes i deletions.
        for (int i = 0; i <= m; ++i) {
            dp[i][0] = i;
        }
        for (int i = 1; i <= m; ++i) {
            char ci = word1.charAt(i - 1);
            for (int j = 1; j <= n; ++j) {
                char cj = word2.charAt(j - 1);
                if (ci == cj) {
                    // Matching characters cost nothing.
                    dp[i][j] = dp[i - 1][j - 1];
                } else if (i > 1 && j > 1 && ci == word2.charAt(j - 2)
                        && cj == word1.charAt(i - 2)) {
                    // The last two characters are swapped between the words: one
                    // transposition on top of dp[i-2][j-2], or a plain insert/delete.
                    dp[i][j] = 1 + Math.min(dp[i - 2][j - 2],
                            Math.min(dp[i][j - 1], dp[i - 1][j]));
                } else {
                    // Cheapest of substitute, insert, delete.
                    dp[i][j] = Math.min(dp[i - 1][j - 1] + 1,
                            Math.min(dp[i][j - 1] + 1, dp[i - 1][j] + 1));
                }
            }
        }
        return dp[m][n];
    }
}

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.ExpressionVisitorAdapter;
import net.sf.jsqlparser.expression.Function;
@@ -9,7 +10,6 @@ import java.util.Map;
@Slf4j
public class FieldReplaceVisitor extends ExpressionVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private ThreadLocal<Boolean> exactReplace = ThreadLocal.withInitial(() -> false);
@@ -20,7 +20,8 @@ public class FieldReplaceVisitor extends ExpressionVisitorAdapter {
/**
 * Visits a column reference and passes it, together with {@code fieldNameMap} and the
 * current thread's {@code exactReplace} flag, to {@code replaceColumn}.
 *
 * NOTE(review): two replacement calls appear back to back here, one through the
 * {@code parseVisitorHelper} field and one through a {@code ReplaceService} bean. This
 * looks like a diff rendered without its +/- markers (the old call followed by its
 * replacement) - confirm against the actual file which call is live.
 *
 * @param column the column node being visited
 */
@Override
public void visit(Column column) {
parseVisitorHelper.replaceColumn(column, fieldNameMap, exactReplace.get());
// The service is looked up from ContextUtils on every visit rather than held in a field.
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
replaceService.replaceColumn(column, fieldNameMap, exactReplace.get());
}
@Override

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.DoubleValue;
import net.sf.jsqlparser.expression.Expression;
@@ -27,7 +28,6 @@ import java.util.Objects;
@Slf4j
public class FieldValueReplaceVisitor extends ExpressionVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private boolean exactReplace;
private Map<String, Map<String, String>> filedNameToValueMap;
@@ -138,7 +138,8 @@ public class FieldValueReplaceVisitor extends ExpressionVisitorAdapter {
/**
 * Resolves the replacement for {@code beforeValue}.
 *
 * The value is first looked up directly in {@code valueMap}. When that lookup is empty and
 * {@code exactReplace} is off, resolution is delegated to {@code getReplaceValue(beforeValue,
 * valueMap, false)}; otherwise the direct lookup result is returned as-is, which may be
 * {@code null} or empty.
 *
 * NOTE(review): the two consecutive {@code return} statements inside the {@code if} look
 * like the old line and its replacement from a diff whose +/- markers were stripped -
 * confirm against the actual file.
 *
 * @param valueMap mapping from original values to replacement values
 * @param beforeValue the value to replace
 * @return the resolved replacement, or the direct lookup result when no delegation happens
 */
private String getReplaceValue(Map<String, String> valueMap, String beforeValue) {
String afterValue = valueMap.get(String.valueOf(beforeValue));
if (StringUtils.isEmpty(afterValue) && !exactReplace) {
return parseVisitorHelper.getReplaceValue(beforeValue, valueMap, false);
// The service is looked up from ContextUtils only when the direct lookup came back empty.
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
return replaceService.getReplaceValue(beforeValue, valueMap, false);
}
return afterValue;
}

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
@@ -14,7 +15,6 @@ import java.util.Map;
@Slf4j
public class GroupByReplaceVisitor implements GroupByVisitor {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private boolean exactReplace;
@@ -34,10 +34,11 @@ public class GroupByReplaceVisitor implements GroupByVisitor {
}
/**
 * Dispatches a GROUP BY expression to the matching replacement routine: {@code Column}
 * instances go to {@code replaceColumn}, {@code Function} instances go to
 * {@code replaceFunction}. Any other expression type is left untouched.
 *
 * NOTE(review): each branch shows a {@code parseVisitorHelper} call immediately followed by
 * the equivalent {@code replaceService} call. This looks like a diff rendered without its
 * +/- markers (old line, then new line) - confirm against the actual file.
 *
 * @param expression the GROUP BY expression to process
 */
private void replaceExpression(Expression expression) {
// The service is looked up from ContextUtils on every call rather than held in a field.
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
if (expression instanceof Column) {
parseVisitorHelper.replaceColumn((Column) expression, fieldNameMap, exactReplace);
replaceService.replaceColumn((Column) expression, fieldNameMap, exactReplace);
} else if (expression instanceof Function) {
parseVisitorHelper.replaceFunction((Function) expression, fieldNameMap, exactReplace);
replaceService.replaceFunction((Function) expression, fieldNameMap, exactReplace);
}
}
}

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
import net.sf.jsqlparser.schema.Column;
@@ -9,8 +10,6 @@ import net.sf.jsqlparser.statement.select.OrderByVisitorAdapter;
import java.util.Map;
public class OrderByReplaceVisitor extends OrderByVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private boolean exactReplace;
@@ -22,11 +21,12 @@ public class OrderByReplaceVisitor extends OrderByVisitorAdapter {
/**
 * Visits an ORDER BY element: a {@code Column} expression is passed to
 * {@code replaceColumn}, a {@code Function} expression to {@code replaceFunction}, and the
 * element is then handed to the superclass visitor.
 *
 * NOTE(review): each branch shows a {@code parseVisitorHelper} call immediately followed by
 * the equivalent {@code replaceService} call. This looks like a diff rendered without its
 * +/- markers (old line, then new line) - confirm against the actual file.
 *
 * @param orderBy the ORDER BY element being visited
 */
@Override
public void visit(OrderByElement orderBy) {
Expression expression = orderBy.getExpression();
// The service is looked up from ContextUtils on every visit rather than held in a field.
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
if (expression instanceof Column) {
parseVisitorHelper.replaceColumn((Column) expression, fieldNameMap, exactReplace);
replaceService.replaceColumn((Column) expression, fieldNameMap, exactReplace);
}
if (expression instanceof Function) {
parseVisitorHelper.replaceFunction((Function) expression, fieldNameMap, exactReplace);
replaceService.replaceFunction((Function) expression, fieldNameMap, exactReplace);
}
// Replacement runs before delegating to the superclass visitor.
super.visit(orderBy);
}

View File

@@ -1,12 +1,15 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.StringUtil;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
import net.sf.jsqlparser.expression.operators.relational.ExpressionList;
import net.sf.jsqlparser.schema.Column;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.util.Map;
import java.util.Map.Entry;
@@ -14,7 +17,12 @@ import java.util.Optional;
import java.util.stream.Collectors;
@Slf4j
public class ParseVisitorHelper {
@Service
@Data
public class ReplaceService {
@Value("${s2.replace.column.threshold:0.4}")
private double replaceColumnThreshold;
public void replaceFunction(Function expression, Map<String, String> fieldNameMap,
boolean exactReplace) {
@@ -38,9 +46,9 @@ public class ParseVisitorHelper {
public String getReplaceValue(String beforeValue, Map<String, String> valueMap,
boolean exactReplace) {
String value = valueMap.get(beforeValue);
if (StringUtils.isNotBlank(value)) {
return value;
String replaceValue = valueMap.get(beforeValue);
if (StringUtils.isNotBlank(replaceValue)) {
return replaceValue;
}
if (exactReplace) {
return null;
@@ -48,47 +56,18 @@ public class ParseVisitorHelper {
Optional<Entry<String, String>> first = valueMap.entrySet().stream().sorted((k1, k2) -> {
String k1Value = k1.getKey();
String k2Value = k2.getKey();
Double k1Similarity = getSimilarity(beforeValue, k1Value);
Double k2Similarity = getSimilarity(beforeValue, k2Value);
Double k1Similarity = EditDistanceUtils.getSimilarity(beforeValue, k1Value);
Double k2Similarity = EditDistanceUtils.getSimilarity(beforeValue, k2Value);
return k2Similarity.compareTo(k1Similarity);
}).collect(Collectors.toList()).stream().findFirst();
if (first.isPresent()) {
return first.get().getValue();
replaceValue = first.get().getValue();
double similarity = EditDistanceUtils.getSimilarity(beforeValue, replaceValue);
if (similarity > replaceColumnThreshold) {
return replaceValue;
}
}
return beforeValue;
}
/**
 * Computes the number of single-character edits (insert, delete, substitute, or swap of
 * two adjacent characters) needed to transform {@code word1} into {@code word2}.
 *
 * NOTE(review): this body matches EditDistanceUtils.editDistance shown earlier in the same
 * commit line for line; presumably this is the copy the commit removes - confirm.
 *
 * @param word1 source string
 * @param word2 target string
 * @return the edit distance between the two strings
 */
public static int editDistance(String word1, String word2) {
final int m = word1.length();
final int n = word2.length();
// dp[i][j] = distance between the first i chars of word1 and the first j chars of word2.
int[][] dp = new int[m + 1][n + 1];
// Turning an empty prefix into j characters takes j insertions.
for (int j = 0; j <= n; ++j) {
dp[0][j] = j;
}
// Turning i characters into an empty prefix takes i deletions.
for (int i = 0; i <= m; ++i) {
dp[i][0] = i;
}
for (int i = 1; i <= m; ++i) {
char ci = word1.charAt(i - 1);
for (int j = 1; j <= n; ++j) {
char cj = word2.charAt(j - 1);
if (ci == cj) {
// Matching characters cost nothing.
dp[i][j] = dp[i - 1][j - 1];
} else if (i > 1 && j > 1 && ci == word2.charAt(j - 2)
&& cj == word1.charAt(i - 2)) {
// The last two characters are swapped between the words: one transposition on
// top of dp[i-2][j-2], or a plain insert/delete.
dp[i][j] = 1 + Math.min(dp[i - 2][j - 2], Math.min(dp[i][j - 1], dp[i - 1][j]));
} else {
// Cheapest of substitute, insert, delete.
dp[i][j] = Math.min(dp[i - 1][j - 1] + 1,
Math.min(dp[i][j - 1] + 1, dp[i - 1][j] + 1));
}
}
}
return dp[m][n];
}
/**
 * Returns a normalized similarity score: {@code 1 - editDistance / max(length)}, so
 * identical non-empty strings score {@code 1.0}.
 *
 * NOTE(review): when both strings are empty the expression divides 0.0 by 0 and the result
 * is NaN rather than 1.0.
 *
 * @param word1 first string
 * @param word2 second string
 * @return the similarity score
 */
public double getSimilarity(String word1, String word2) {
return 1 - (double) editDistance(word1, word2) / Math.max(word2.length(), word1.length());
}
}