Simplify包：模拟ES部分分词规则，对查询关键字进行去重

1 week ago · cf8f88dd5d
parent 6383ebbf48
commit cf8f88dd5d
2 changed files with 175 additions and 0 deletions
--- a/simplify/codes.go
+++ b/simplify/codes.go
@ -0,0 +1,3 @@
 package simplify
 // TODO: for hierarchical filters, once a parent code is present its child codes are no longer needed.
--- a/simplify/words.go
+++ b/simplify/words.go
@ -0,0 +1,172 @@
 package simplify
 import (
 	"slices"
 	"strings"
 	"unicode"
 )
 // Words de-duplicates search keywords by imitating a subset of the
 // Elasticsearch tokenization rules.
 type Words struct {
 	// whole and input are the two keyword lists handed to NewWords. Simplify
 	// merges them with the input entries first, so that input keywords are
 	// given priority in its result.
 	whole, input []string
 }
 // NewWords builds a Words from the two keyword lists. Both slices are stored
 // as given, without copying.
 func NewWords(whole []string, input []string) *Words {
 	words := new(Words)
 	words.whole = whole
 	words.input = input
 	return words
 }
 // Simplify returns the de-duplicated search keywords, imitating a subset of
 // the Elasticsearch tokenization rules.
 //
 // Keywords from w.input are processed before those from w.whole and the
 // result keeps that first-seen order, so the output is deterministic and
 // input keywords always come first. Each keyword is trimmed and lower-cased
 // (the returned strings are in that form), and every separator reported by
 // w.separators is treated as a space when keywords are compared.
 //
 // A keyword is dropped when:
 //   - it is empty or consists only of separators;
 //   - an earlier keyword has the same separator-normalized form, e.g. "a b"
 //     after "a-b" (the first one is kept); or
 //   - it contains another keyword, either as a whole space-delimited token
 //     sequence or as a substring whose neighbors are both Chinese characters
 //     or spaces. The shorter keyword is the one that survives.
 //
 // The returned slice is never nil.
 func (w *Words) Simplify() []string {
 	// Build one replacer that turns every separator into a space, instead of
 	// running a ReplaceAll pass per separator for every keyword.
 	separators := w.separators()
 	pairs := make([]string, 0, 2*len(separators))
 	for _, separator := range separators {
 		pairs = append(pairs, separator, " ")
 	}
 	replacer := strings.NewReplacer(pairs...)
 	// candidate is one normalized keyword that takes part in the comparison.
 	type candidate struct {
 		keyword string // trimmed, lower-cased keyword; this is what is returned
 		wide    string // tokens joined by single spaces, wrapped in one space on each side
 		runes   []rune // wide as runes
 		core    []rune // wide without the surrounding spaces, as runes
 	}
 	// Collect candidates in a slice (not a map) so that the order is stable:
 	// input keywords first, then whole keywords.
 	candidates := make([]candidate, 0, len(w.input)+len(w.whole))
 	for _, source := range [][]string{w.input, w.whole} {
 		for _, raw := range source {
 			// Lower-case and trim so that case differences do not defeat the comparison.
 			keyword := strings.ToLower(strings.TrimSpace(raw))
 			if keyword == "" {
 				continue
 			}
 			fields := strings.Fields(replacer.Replace(keyword))
 			if len(fields) == 0 {
 				continue // nothing but separators
 			}
 			core := strings.Join(fields, " ")
 			// Leading and trailing spaces make whole-token containment a plain substring test.
 			wide := " " + core + " "
 			// Keywords that normalize to the same form are duplicates of each
 			// other; keep only the first so they cannot eliminate one another.
 			if slices.ContainsFunc(candidates, func(c candidate) bool { return c.wide == wide }) {
 				continue
 			}
 			candidates = append(candidates, candidate{
 				keyword: keyword,
 				wide:    wide,
 				runes:   []rune(wide),
 				core:    []rune(core),
 			})
 		}
 	}
 	// Keep only the candidates that do not contain any other candidate.
 	result := make([]string, 0, len(candidates))
 	for i, cur := range candidates {
 		redundant := false
 		for j, other := range candidates {
 			if i == j {
 				continue
 			}
 			// Whole-token containment.
 			if strings.Contains(cur.wide, other.wide) {
 				redundant = true
 				break
 			}
 			// Substring containment where both neighbors are Chinese characters or spaces.
 			before, after := w.checkSurroundingChineseOrSpace(cur.runes, other.core)
 			if before && after {
 				redundant = true
 				break
 			}
 		}
 		if !redundant {
 			result = append(result, cur.keyword)
 		}
 	}
 	return result
 }
 // isChineseOrSpace reports whether r is an ASCII space (' ') or a rune that
 // belongs to the Unicode Han script.
 func (w *Words) isChineseOrSpace(r rune) bool {
 	if r == ' ' {
 		return true
 	}
 	return unicode.Is(unicode.Han, r)
 }
 // checkSurroundingChineseOrSpace looks for sub inside str and reports whether
 // the rune immediately before the match and the rune immediately after it are
 // each a Chinese character or a space (see isChineseOrSpace).
 //
 // Every occurrence of sub is examined. As soon as one occurrence has both
 // neighbors qualifying, (true, true) is returned. Otherwise the flags of the
 // first occurrence are returned. When sub is empty or does not occur in str
 // the result is (false, false).
 //
 // The start and end of str do not count as a boundary: a match at index 0
 // yields before == false, and a match that ends at len(str) yields
 // after == false.
 func (w *Words) checkSurroundingChineseOrSpace(str []rune, sub []rune) (bool, bool) {
 	// An empty needle has no meaningful neighbors; bail out instead of indexing sub[0].
 	if len(sub) == 0 || len(sub) > len(str) {
 		return false, false
 	}
 	var firstBefore, firstAfter bool
 	found := false
 	for start := 0; start+len(sub) <= len(str); start++ {
 		end := start + len(sub)
 		if !slices.Equal(str[start:end], sub) {
 			continue
 		}
 		before := start > 0 && w.isChineseOrSpace(str[start-1])
 		after := end < len(str) && w.isChineseOrSpace(str[end])
 		if before && after {
 			return true, true
 		}
 		// Remember the first occurrence so that the result stays the same as
 		// before when no occurrence qualifies on both sides.
 		if !found {
 			found = true
 			firstBefore, firstAfter = before, after
 		}
 	}
 	return firstBefore, firstAfter
 }
 // separators returns the strings that Simplify treats as token boundaries.
 // The list is meant to mirror the separators used on the Elasticsearch side.
 // Each separator appears exactly once.
 //
 // NOTE(review): "_", ":", "'", ".", "·", "‘", "’" and "：" are deliberately
 // not separators (they were disabled in the original list). The closing
 // quote "”" is also absent although "“" is listed — confirm against the ES
 // analyzer configuration.
 func (w *Words) separators() []string {
 	return []string{
 		// ASCII punctuation ("_", ":", "'" and "." are intentionally excluded).
 		"~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")",
 		"+",
 		"`", "-", "=",
 		"{", "}", "|",
 		"[", "]", "\\",
 		"\"",
 		";",
 		"<", ">", "?",
 		",",
 		"/",
 		// Line breaks and tab; "\r\n" is listed before its single-character parts.
 		"\r\n", "\r", "\n", "\t",
 		// Full-width / CJK punctuation ("·", "‘", "’" and "：" are intentionally excluded).
 		"！", "￥", "…", "（", "）", "—",
 		"【", "】", "、",
 		"；",
 		"“",
 		"》", "《", "？",
 		"，", "。",
 	}
 }
		`@ -0,0 +1,3 @@`
							`package simplify`

							`//TODO 如果是层级筛选，有父级code 就不再需要子级code 了`