// Contents of simplify/words.go (package simplify; needs the standard
// library packages "slices", "strings" and "unicode").
//
// The code mimics part of Elasticsearch's tokenization rules in order to
// remove redundant query keywords.
//
// The change that introduced this file also added simplify/codes.go, which
// so far only holds the package clause and this note:
// TODO: for hierarchical filters, once a parent code is present the child
// codes are no longer needed.

// Words holds the two keyword lists that Simplify de-duplicates.
type Words struct {
	whole []string // additional keywords; listed after the input keywords in the result
	input []string // keywords typed by the user; they take priority in the result
}

// NewWords returns a Words for the given keyword lists. The slices are stored
// as passed in (not copied).
func NewWords(whole []string, input []string) *Words {
	return &Words{whole: whole, input: input}
}

// Simplify returns the keywords that are not made redundant by another
// keyword.
//
// Each keyword is trimmed and lower-cased, every separator is turned into a
// space and runs of whitespace are collapsed, which yields its token
// sequence. A keyword is redundant when its token sequence contains the
// token sequence of a different keyword, either delimited by spaces or
// delimited by Han characters.
//
// The returned strings are the trimmed, lower-cased keywords. Input keywords
// come first, followed by the whole keywords, each group in its original
// order. Keywords sharing one token sequence (for example "foo bar" and
// "foo-bar") are represented by the first of them. The result is never nil.
func (w *Words) Simplify() []string {
	replacer := w.separatorReplacer()

	type candidate struct {
		keyword     string // trimmed, lower-cased keyword; this is what gets returned
		padded      string // token sequence with one leading and one trailing space
		paddedRunes []rune // padded, converted once for the rune-wise search
		coreRunes   []rune // token sequence without the padding
	}

	total := len(w.input) + len(w.whole)
	candidates := make([]candidate, 0, total)
	seen := make(map[string]struct{}, total)

	// Input keywords are collected first so that they win ties and lead the
	// result.
	for _, group := range [][]string{w.input, w.whole} {
		for _, raw := range group {
			// Lower-case so that differences in case do not hide duplicates.
			keyword := strings.ToLower(strings.TrimSpace(raw))
			if keyword == "" {
				continue
			}
			core := strings.Join(strings.Fields(replacer.Replace(keyword)), " ")
			if core == "" {
				// The keyword consisted of separators only.
				continue
			}
			// Keywords with the same token sequence would otherwise mark
			// each other as redundant and all of them would be dropped.
			if _, dup := seen[core]; dup {
				continue
			}
			seen[core] = struct{}{}

			padded := " " + core + " "
			candidates = append(candidates, candidate{
				keyword:     keyword,
				padded:      padded,
				paddedRunes: []rune(padded),
				coreRunes:   []rune(core),
			})
		}
	}

	result := make([]string, 0, len(candidates))
	for i := range candidates {
		outer := &candidates[i]
		redundant := false
		for j := range candidates {
			if i == j {
				continue
			}
			inner := &candidates[j]
			// Whole-token containment: " red apple " contains " apple ".
			if strings.Contains(outer.padded, inner.padded) {
				redundant = true
				break
			}
			// Containment delimited by Han characters or spaces:
			// "苹果" inside " 红苹果 ".
			before, after := w.checkSurroundingChineseOrSpace(outer.paddedRunes, inner.coreRunes)
			if before && after {
				redundant = true
				break
			}
		}
		if !redundant {
			result = append(result, outer.keyword)
		}
	}
	return result
}

// separatorReplacer builds a replacer that turns every separator into a
// single space in one pass over the string.
func (w *Words) separatorReplacer() *strings.Replacer {
	seps := w.separators()
	pairs := make([]string, 0, 2*len(seps))
	for _, sep := range seps {
		pairs = append(pairs, sep, " ")
	}
	return strings.NewReplacer(pairs...)
}

// isChineseOrSpace reports whether r is an ASCII space or a Han character.
func (w *Words) isChineseOrSpace(r rune) bool {
	return r == ' ' || unicode.Is(unicode.Han, r)
}

// checkSurroundingChineseOrSpace looks for sub inside str and reports whether
// the rune directly before and the rune directly after an occurrence are a
// Han character or a space.
//
// Every occurrence is examined. If one of them is delimited on both sides the
// result is (true, true); otherwise the flags of the first occurrence are
// returned. A side that lies outside str counts as false. When sub is empty
// or does not occur in str the result is (false, false).
func (w *Words) checkSurroundingChineseOrSpace(str []rune, sub []rune) (bool, bool) {
	if len(sub) == 0 || len(sub) > len(str) {
		return false, false
	}

	var firstBefore, firstAfter bool
	found := false
	for start := 0; start+len(sub) <= len(str); start++ {
		end := start + len(sub)
		if !slices.Equal(str[start:end], sub) {
			continue
		}

		var before, after bool
		if start > 0 {
			before = w.isChineseOrSpace(str[start-1])
		}
		if end < len(str) {
			after = w.isChineseOrSpace(str[end])
		}
		if before && after {
			return true, true
		}
		if !found {
			// Remember the first occurrence; it is the fallback result.
			firstBefore, firstAfter, found = before, after, true
		}
	}
	return firstBefore, firstAfter
}

// separators lists the strings that split a keyword into tokens; it is meant
// to stay in line with the separators used by the Elasticsearch search.
// Entries that are commented out are deliberately not treated as separators.
func (w *Words) separators() []string {
	return []string{
		// ASCII punctuation, in keyboard order.
		"~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")",
		// "_",
		"+",

		"`", "-", "=",
		"{", "}", "|",
		"[", "]", "\\",

		// ":",
		"\"",

		";",
		// "'",

		"<", ">", "?",

		",",
		// ".",
		"/",

		// Line breaks and tabs.
		"\r\n", "\r", "\n", "\t",

		// Full-width and CJK punctuation.
		"!", "¥", "…", "(", ")", "—",

		// "·",

		"【", "】", "、",

		";",
		// "‘",
		// "’",

		// ":",
		"“",
		"”", // closing counterpart of the opening quote above

		"》", "《", "?",
		",", "。", "、",
	}
}