[improvement][chat] Add threshold judgment to field replacement (#1850)

This commit is contained in:
lexluo09
2024-10-28 13:57:35 +08:00
committed by GitHub
parent c07b64d33c
commit 920c6e2846
9 changed files with 132 additions and 78 deletions

View File

@@ -247,6 +247,12 @@
<groupId>org.codehaus.woodstox</groupId>
<artifactId>stax2-api</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-inline</artifactId>
<version>${mockito-inline.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,37 @@
package com.tencent.supersonic.common.jsqlparser;
/**
 * String similarity helpers based on an edit distance that counts insertions,
 * deletions, substitutions and swaps of two adjacent characters as one edit each.
 */
public class EditDistanceUtils {

    /**
     * Returns a normalized similarity score for two strings:
     * {@code 1 - editDistance / max(length)}.
     *
     * <p>Identical strings score {@code 1.0}. Two empty strings are treated as identical
     * and also score {@code 1.0}; without this guard the division would be {@code 0 / 0}
     * and yield {@code NaN}, which makes every threshold comparison evaluate to false.
     *
     * @param word1 first string, must not be {@code null}
     * @param word2 second string, must not be {@code null}
     * @return similarity, where {@code 1.0} means the strings are equal
     * @throws NullPointerException if either argument is {@code null}
     */
    public static double getSimilarity(String word1, String word2) {
        final int longest = Math.max(word1.length(), word2.length());
        if (longest == 0) {
            // Both inputs are empty: they are equal, and dividing by zero must be avoided.
            return 1.0;
        }
        return 1 - (double) editDistance(word1, word2) / longest;
    }

    /**
     * Computes the edit distance between two strings with a dynamic-programming table.
     *
     * <p>{@code dp[i][j]} holds the distance between the first {@code i} characters of
     * {@code word1} and the first {@code j} characters of {@code word2}.
     *
     * @param word1 first string, must not be {@code null}
     * @param word2 second string, must not be {@code null}
     * @return number of edits needed to turn {@code word1} into {@code word2}
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int editDistance(String word1, String word2) {
        final int m = word1.length();
        final int n = word2.length();
        int[][] dp = new int[m + 1][n + 1];

        // Base cases: turning an empty prefix into a prefix of length k costs k edits.
        for (int j = 0; j <= n; ++j) {
            dp[0][j] = j;
        }
        for (int i = 0; i <= m; ++i) {
            dp[i][0] = i;
        }

        for (int i = 1; i <= m; ++i) {
            char ci = word1.charAt(i - 1);
            for (int j = 1; j <= n; ++j) {
                char cj = word2.charAt(j - 1);
                if (ci == cj) {
                    // Matching characters cost nothing.
                    dp[i][j] = dp[i - 1][j - 1];
                } else if (i > 1 && j > 1 && ci == word2.charAt(j - 2)
                        && cj == word1.charAt(i - 2)) {
                    // The last two characters are swapped: one edit on top of the prefix
                    // before the swapped pair, or a plain insertion/deletion if cheaper.
                    dp[i][j] = 1 + Math.min(dp[i - 2][j - 2],
                            Math.min(dp[i][j - 1], dp[i - 1][j]));
                } else {
                    // Cheapest of substitution, insertion and deletion.
                    dp[i][j] = Math.min(dp[i - 1][j - 1] + 1,
                            Math.min(dp[i][j - 1] + 1, dp[i - 1][j] + 1));
                }
            }
        }
        return dp[m][n];
    }
}

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.ExpressionVisitorAdapter;
import net.sf.jsqlparser.expression.Function;
@@ -9,7 +10,6 @@ import java.util.Map;
@Slf4j
public class FieldReplaceVisitor extends ExpressionVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private ThreadLocal<Boolean> exactReplace = ThreadLocal.withInitial(() -> false);
@@ -20,7 +20,8 @@ public class FieldReplaceVisitor extends ExpressionVisitorAdapter {
@Override
public void visit(Column column) {
parseVisitorHelper.replaceColumn(column, fieldNameMap, exactReplace.get());
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
replaceService.replaceColumn(column, fieldNameMap, exactReplace.get());
}
@Override

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.DoubleValue;
import net.sf.jsqlparser.expression.Expression;
@@ -27,7 +28,6 @@ import java.util.Objects;
@Slf4j
public class FieldValueReplaceVisitor extends ExpressionVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private boolean exactReplace;
private Map<String, Map<String, String>> filedNameToValueMap;
@@ -138,7 +138,8 @@ public class FieldValueReplaceVisitor extends ExpressionVisitorAdapter {
private String getReplaceValue(Map<String, String> valueMap, String beforeValue) {
String afterValue = valueMap.get(String.valueOf(beforeValue));
if (StringUtils.isEmpty(afterValue) && !exactReplace) {
return parseVisitorHelper.getReplaceValue(beforeValue, valueMap, false);
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
return replaceService.getReplaceValue(beforeValue, valueMap, false);
}
return afterValue;
}

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
@@ -14,7 +15,6 @@ import java.util.Map;
@Slf4j
public class GroupByReplaceVisitor implements GroupByVisitor {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private boolean exactReplace;
@@ -34,10 +34,11 @@ public class GroupByReplaceVisitor implements GroupByVisitor {
}
private void replaceExpression(Expression expression) {
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
if (expression instanceof Column) {
parseVisitorHelper.replaceColumn((Column) expression, fieldNameMap, exactReplace);
replaceService.replaceColumn((Column) expression, fieldNameMap, exactReplace);
} else if (expression instanceof Function) {
parseVisitorHelper.replaceFunction((Function) expression, fieldNameMap, exactReplace);
replaceService.replaceFunction((Function) expression, fieldNameMap, exactReplace);
}
}
}

View File

@@ -1,5 +1,6 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.ContextUtils;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
import net.sf.jsqlparser.schema.Column;
@@ -9,8 +10,6 @@ import net.sf.jsqlparser.statement.select.OrderByVisitorAdapter;
import java.util.Map;
public class OrderByReplaceVisitor extends OrderByVisitorAdapter {
ParseVisitorHelper parseVisitorHelper = new ParseVisitorHelper();
private Map<String, String> fieldNameMap;
private boolean exactReplace;
@@ -22,11 +21,12 @@ public class OrderByReplaceVisitor extends OrderByVisitorAdapter {
@Override
public void visit(OrderByElement orderBy) {
Expression expression = orderBy.getExpression();
ReplaceService replaceService = ContextUtils.getBean(ReplaceService.class);
if (expression instanceof Column) {
parseVisitorHelper.replaceColumn((Column) expression, fieldNameMap, exactReplace);
replaceService.replaceColumn((Column) expression, fieldNameMap, exactReplace);
}
if (expression instanceof Function) {
parseVisitorHelper.replaceFunction((Function) expression, fieldNameMap, exactReplace);
replaceService.replaceFunction((Function) expression, fieldNameMap, exactReplace);
}
super.visit(orderBy);
}

View File

@@ -1,12 +1,15 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.util.StringUtil;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.expression.Expression;
import net.sf.jsqlparser.expression.Function;
import net.sf.jsqlparser.expression.operators.relational.ExpressionList;
import net.sf.jsqlparser.schema.Column;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import java.util.Map;
import java.util.Map.Entry;
@@ -14,7 +17,12 @@ import java.util.Optional;
import java.util.stream.Collectors;
@Slf4j
public class ParseVisitorHelper {
@Service
@Data
public class ReplaceService {
@Value("${s2.replace.column.threshold:0.4}")
private double replaceColumnThreshold;
public void replaceFunction(Function expression, Map<String, String> fieldNameMap,
boolean exactReplace) {
@@ -38,9 +46,9 @@ public class ParseVisitorHelper {
public String getReplaceValue(String beforeValue, Map<String, String> valueMap,
boolean exactReplace) {
String value = valueMap.get(beforeValue);
if (StringUtils.isNotBlank(value)) {
return value;
String replaceValue = valueMap.get(beforeValue);
if (StringUtils.isNotBlank(replaceValue)) {
return replaceValue;
}
if (exactReplace) {
return null;
@@ -48,47 +56,18 @@ public class ParseVisitorHelper {
Optional<Entry<String, String>> first = valueMap.entrySet().stream().sorted((k1, k2) -> {
String k1Value = k1.getKey();
String k2Value = k2.getKey();
Double k1Similarity = getSimilarity(beforeValue, k1Value);
Double k2Similarity = getSimilarity(beforeValue, k2Value);
Double k1Similarity = EditDistanceUtils.getSimilarity(beforeValue, k1Value);
Double k2Similarity = EditDistanceUtils.getSimilarity(beforeValue, k2Value);
return k2Similarity.compareTo(k1Similarity);
}).collect(Collectors.toList()).stream().findFirst();
if (first.isPresent()) {
return first.get().getValue();
replaceValue = first.get().getValue();
double similarity = EditDistanceUtils.getSimilarity(beforeValue, replaceValue);
if (similarity > replaceColumnThreshold) {
return replaceValue;
}
}
return beforeValue;
}
public static int editDistance(String word1, String word2) {
final int m = word1.length();
final int n = word2.length();
int[][] dp = new int[m + 1][n + 1];
for (int j = 0; j <= n; ++j) {
dp[0][j] = j;
}
for (int i = 0; i <= m; ++i) {
dp[i][0] = i;
}
for (int i = 1; i <= m; ++i) {
char ci = word1.charAt(i - 1);
for (int j = 1; j <= n; ++j) {
char cj = word2.charAt(j - 1);
if (ci == cj) {
dp[i][j] = dp[i - 1][j - 1];
} else if (i > 1 && j > 1 && ci == word2.charAt(j - 2)
&& cj == word1.charAt(i - 2)) {
dp[i][j] = 1 + Math.min(dp[i - 2][j - 2], Math.min(dp[i][j - 1], dp[i - 1][j]));
} else {
dp[i][j] = Math.min(dp[i - 1][j - 1] + 1,
Math.min(dp[i][j - 1] + 1, dp[i - 1][j] + 1));
}
}
}
return dp[m][n];
}
public double getSimilarity(String word1, String word2) {
return 1 - (double) editDistance(word1, word2) / Math.max(word2.length(), word1.length());
}
}

View File

@@ -18,9 +18,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFields(replaceSql, fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' AND song_publis_date = '2023-08-01'"
+ " ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date "
+ "<= '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' AND "
+ "歌曲发布时 = '2023-08-01' ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}
@@ -77,9 +77,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFields(replaceSql, fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals("SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 "
+ "WHERE YEAR(发行日期) IN (2022, 2023) AND sys_imp_date = '2023-08-14' "
+ "GROUP BY YEAR(publish_date)", replaceSql);
Assert.assertEquals("SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 WHERE "
+ "YEAR(发行日期) IN (2022, 2023) AND sys_imp_date = '2023-08-14' GROUP BY YEAR(发行日期)",
replaceSql);
}
@Test
@@ -91,9 +91,10 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFields(replaceSql, fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals("SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 "
+ "WHERE YEAR(发行日期) IN (2022, 2023) AND sys_imp_date = '2023-08-14'"
+ " GROUP BY publish_date", replaceSql);
Assert.assertEquals(
"SELECT YEAR(发行日期), count(song_name) FROM 歌曲库 WHERE YEAR(发行日期) "
+ "IN (2022, 2023) AND sys_imp_date = '2023-08-14' GROUP BY 发行日期",
replaceSql);
}
@@ -107,9 +108,8 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2022-08-11' "
+ "AND publish_date <= '2023-08-11') AND play_count > 1000000 AND "
+ "(sys_imp_date >= '2023-07-12' AND sys_imp_date <= '2023-08-11')",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2022-08-11' AND publish_date <= '2023-08-11')"
+ " AND 结算播放量 > 1000000 AND (sys_imp_date >= '2023-07-12' AND sys_imp_date <= '2023-08-11')",
replaceSql);
}
@@ -123,8 +123,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-08-08' AND publish_date "
+ "<= '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY "
+ "播放量 DESC LIMIT 11",
replaceSql);
}
@@ -138,8 +139,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-01-01' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-01-01' AND publish_date "
+ "<= '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' "
+ "ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}
@@ -153,8 +155,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-02-09' AND publish_date <= '2023-08-09')"
+ " AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY play_count DESC LIMIT 11",
"SELECT song_name FROM 歌曲库 WHERE (publish_date >= '2023-02-09' AND publish_date <="
+ " '2023-08-09') AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' "
+ "ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}
@@ -167,9 +170,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
fieldToBizName);
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
replaceSql = SqlRemoveHelper.removeNumberFilter(replaceSql);
Assert.assertEquals("SELECT song_name FROM 歌曲库 WHERE publish_date <= '2023-02-09' AND"
+ " singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09'"
+ " ORDER BY play_count DESC LIMIT 11", replaceSql);
Assert.assertEquals("SELECT song_name FROM 歌曲库 WHERE publish_date <= '2023-02-09' "
+ "AND singer_name = '邓紫棋' AND sys_imp_date = '2023-08-09' ORDER BY 播放量 DESC LIMIT 11",
replaceSql);
}
@Test
@@ -222,9 +225,8 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT song_name, sum(评分) FROM CSpider WHERE (1 < 2) AND "
+ "sys_imp_date = '2023-10-15' GROUP BY song_name HAVING "
+ "sum(评分) < (SELECT min(评分) FROM CSpider WHERE user_id = '英文')",
"SELECT 歌曲名称, sum(评分) FROM CSpider WHERE (1 < 2) AND sys_imp_date = '2023-10-15' "
+ "GROUP BY 歌曲名称 HAVING sum(评分) < (SELECT min(评分) FROM CSpider WHERE 语种 = '英文')",
replaceSql);
}
@@ -239,9 +241,9 @@ class SqlReplaceFieldsTest extends SqlReplaceHelperTest {
replaceSql = SqlReplaceHelper.replaceFunction(replaceSql);
Assert.assertEquals(
"SELECT sum(评分) / (SELECT sum(评分) FROM CSpider WHERE sys_imp_date = '2023-10-15') "
+ "FROM CSpider WHERE sys_imp_date = '2023-10-15' GROUP BY song_name HAVING "
+ "sum(评分) < (SELECT min(评分) FROM CSpider WHERE user_id = '英文')",
"SELECT sum(评分) / (SELECT sum(评分) FROM CSpider WHERE sys_imp_date = '2023-10-15') FROM "
+ "CSpider WHERE sys_imp_date = '2023-10-15' GROUP BY 歌曲名称 HAVING sum(评分) < (SELECT min(评分) "
+ "FROM CSpider WHERE 语种 = '英文')",
replaceSql);
}

View File

@@ -1,9 +1,13 @@
package com.tencent.supersonic.common.jsqlparser;
import com.tencent.supersonic.common.pojo.enums.AggOperatorEnum;
import com.tencent.supersonic.common.util.ContextUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.junit.Assert;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.MockedStatic;
import java.util.Collections;
import java.util.HashMap;
@@ -11,10 +15,25 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import static org.mockito.Mockito.mockStatic;
/**
* Unit tests for SqlReplaceHelper.
*/
class SqlReplaceHelperTest {
private MockedStatic<ContextUtils> mockedContextUtils;
@BeforeEach
public void setUp() {
    // Stub the static ContextUtils.getBean lookup so it hands back a plain
    // ReplaceService whose replace-column threshold is set to 0.0.
    mockedContextUtils = mockStatic(ContextUtils.class);

    ReplaceService stubbedService = new ReplaceService();
    stubbedService.setReplaceColumnThreshold(0.0);

    mockedContextUtils
            .when(() -> ContextUtils.getBean(ReplaceService.class))
            .thenReturn(stubbedService);
}
@Test
void testReplaceAggField() {
String sql = "SELECT 维度1,sum(播放量) FROM 数据库 "
@@ -334,4 +353,12 @@ class SqlReplaceHelperTest {
fieldToBizName.put("访问次数", "pv");
return fieldToBizName;
}
@AfterEach
public void tearDown() {
    // Nothing to release when the static mock was never created.
    if (mockedContextUtils == null) {
        return;
    }
    // Close the static mock of ContextUtils opened in setUp.
    mockedContextUtils.close();
}
}