package simplify import ( "slices" "strings" "unicode" ) // 模拟ES部分分词规则,对查询关键字进行去重 type Words struct { whole []string input []string } func NewWords(whole []string, input []string) *Words { return &Words{whole: whole, input: input} } func (w *Words) Simplify() []string { // 分隔符列表,与ES搜索的分隔符保持一致 separators := w.separators() keywords := make([]string, 0) keywords = append(keywords, w.input...) keywords = append(keywords, w.whole...) wides := make(map[string]string) for _, keyword := range keywords { // 转换为小写并去除两端空白字符,规避大小写造成异常剔除 keyword = strings.ToLower(strings.TrimSpace(keyword)) if keyword == "" { continue } // 将分隔符替换为空格 str := keyword for _, separator := range separators { str = strings.ReplaceAll(str, separator, " ") } wide := " " + strings.Join(strings.Fields(str), " ") + " " // 空格开头结尾,便于包含判断 if strings.TrimSpace(wide) == "" { continue } wides[keyword] = wide } // 计算重复次数 repetition := make(map[string]int) for _, wide1 := range wides { for keyword, wide2 := range wides { // 首尾空格字符串 包含另一个 首尾空格字符串,则认为重复 if strings.Contains(wide2, wide1) { repetition[keyword]++ continue } // 子串 在 首尾空格父串 中, 前后位置的字符 是 中文或空格,也认为重复 before, after := w.checkSurroundingChineseOrSpace([]rune(wide2), []rune(strings.TrimSpace(wide1))) if before && after { repetition[keyword]++ continue } } } // 筛选出不重复且不是输入的字符串 filteredInput := make([]string, 0) filteredWhole := make([]string, 0) for keyword, times := range repetition { if times == 1 { if slices.Contains(w.input, keyword) { filteredInput = append(filteredInput, keyword) } else { filteredWhole = append(filteredWhole, keyword) } } } result := make([]string, 0) result = append(result, filteredInput...) // ‘input’ 放在前面,保证输入字符串优先 result = append(result, filteredWhole...) return result } // isChineseOrSpace 判断给定的rune字符是否为中文或空格 func (w *Words) isChineseOrSpace(r rune) bool { return r == ' ' || unicode.Is(unicode.Scripts["Han"], r) } // CheckSurroundingChineseOrSpace 检查子串sub前后的UTF-8字符是否为中文或空格 func (w *Words) checkSurroundingChineseOrSpace(str []rune, sub []rune) (bool, bool) { // 找到子串在父串中的位置 startIndex := -1 lenStr := len(str) lenSub := len(sub) for i := 0; i <= lenStr-lenSub; i++ { if str[i] == sub[0] { // 找到子串的起始字符 match := true for j := 1; j < lenSub; j++ { if i+j >= lenStr || str[i+j] != sub[j] { match = false break } } if match { startIndex = i break } } } if startIndex == -1 { // 子串不在父串中 return false, false } // 检查子串前后的字符是否为中文或空格 var isBeforeChineseOrSpace, isAfterChineseOrSpace bool if startIndex > 0 { isBeforeChineseOrSpace = w.isChineseOrSpace(str[startIndex-1]) } if startIndex+len(sub) < len(str) { isAfterChineseOrSpace = w.isChineseOrSpace(str[startIndex+len(sub)]) } return isBeforeChineseOrSpace, isAfterChineseOrSpace } func (w *Words) separators() []string { // 分隔符列表,与ES搜索的分隔符保持一致 return []string{ "~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", // "_", "+", "`", "-", "=", "{", "}", "|", "[", "]", "\\", // ":", "\"", ";", // "'", "<", ">", "?", ",", // ".", "/", "\r\n", "\r", "\n", "\t", "!", "¥", "…", "(", ")", "—", // "·", "【", "】", "、", ";", // "‘", // "’", // ":", "“", "》", "《", "?", ",", "。", "、", } }