You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
173 lines
3.9 KiB
173 lines
3.9 KiB
2 weeks ago
|
package simplify
|
||
|
|
||
|
import (
|
||
|
"slices"
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
)
|
||
|
|
||
|
// Words de-duplicates search keywords by emulating part of the Elasticsearch
// tokenization rules: a keyword whose tokens already include another keyword
// is considered redundant.
type Words struct {
	whole []string // additional keywords; listed after the input keywords in the result
	input []string // keywords that take priority in the result
}

// NewWords returns a Words that holds the given keyword lists.
// The slices are stored as-is (not copied).
func NewWords(whole []string, input []string) *Words {
	return &Words{whole: whole, input: input}
}

// Simplify returns the keywords that do not contain any other keyword.
//
// Every keyword is trimmed and lower-cased, its separators (see separators)
// are replaced by spaces, and runs of whitespace are collapsed. A keyword A is
// dropped when another keyword B occurs inside A either as a whole
// space-delimited token sequence, or with a Han character or a space directly
// before and after it.
//
// The result holds the trimmed, lower-cased keywords. Keywords taken from
// input come first, followed by the ones from whole, each group in its
// original order. Keywords that normalize to the same token sequence (for
// example "a-b" and "a b") are treated as one keyword and only the first
// occurrence is kept. The returned slice is never nil.
func (w *Words) Simplify() []string {
	type candidate struct {
		keyword string // trimmed, lower-cased keyword; the value that is returned
		wide    string // tokens joined by single spaces, wrapped in one leading and one trailing space
		runes   []rune // wide as runes, prepared once for the adjacency check
		inner   []rune // wide without the wrapping spaces
	}

	// Build a replacer that turns every separator into a space in one pass.
	separators := w.separators()
	pairs := make([]string, 0, 2*len(separators))
	for _, sep := range separators {
		pairs = append(pairs, sep, " ")
	}
	replacer := strings.NewReplacer(pairs...)

	total := len(w.input) + len(w.whole)
	candidates := make([]candidate, 0, total)
	seen := make(map[string]struct{}, total)
	collect := func(raws []string) {
		for _, raw := range raws {
			// Lower-case so that differences in case do not defeat the comparison.
			keyword := strings.ToLower(strings.TrimSpace(raw))
			if keyword == "" {
				continue
			}
			tokens := strings.Fields(replacer.Replace(keyword))
			if len(tokens) == 0 {
				// The keyword consisted of separators only.
				continue
			}
			inner := strings.Join(tokens, " ")
			// The wrapping spaces make strings.Contains match whole tokens only.
			wide := " " + inner + " "
			if _, dup := seen[wide]; dup {
				// Equivalent to an earlier keyword; keep the first one only,
				// otherwise the two would eliminate each other below.
				continue
			}
			seen[wide] = struct{}{}
			candidates = append(candidates, candidate{
				keyword: keyword,
				wide:    wide,
				runes:   []rune(wide),
				inner:   []rune(inner),
			})
		}
	}
	// input is collected first so that its keywords lead the result.
	collect(w.input)
	collect(w.whole)

	result := make([]string, 0, len(candidates))
outer:
	for i := range candidates {
		parent := &candidates[i]
		for j := range candidates {
			if i == j {
				continue
			}
			child := &candidates[j]
			// child appears in parent as a whole token sequence.
			if strings.Contains(parent.wide, child.wide) {
				continue outer
			}
			// child appears in parent surrounded by Han characters or spaces.
			if before, after := w.checkSurroundingChineseOrSpace(parent.runes, child.inner); before && after {
				continue outer
			}
		}
		result = append(result, parent.keyword)
	}
	return result
}

// isChineseOrSpace reports whether r is an ASCII space or a character of the
// Han script.
func (w *Words) isChineseOrSpace(r rune) bool {
	return r == ' ' || unicode.Is(unicode.Han, r)
}

// checkSurroundingChineseOrSpace looks for sub inside str and reports whether
// the rune directly before and the rune directly after an occurrence is a Han
// character or a space.
//
// All occurrences are examined: as soon as one has a qualifying rune on both
// sides, (true, true) is returned. Otherwise the flags of the first occurrence
// are returned. A side that lies outside of str counts as false. When sub is
// empty or does not occur in str, the result is (false, false).
func (w *Words) checkSurroundingChineseOrSpace(str []rune, sub []rune) (bool, bool) {
	lenStr, lenSub := len(str), len(sub)
	if lenSub == 0 || lenSub > lenStr {
		return false, false
	}

	found := false
	var firstBefore, firstAfter bool
	for i := 0; i+lenSub <= lenStr; i++ {
		end := i + lenSub
		if !slices.Equal(str[i:end], sub) {
			continue
		}
		before := i > 0 && w.isChineseOrSpace(str[i-1])
		after := end < lenStr && w.isChineseOrSpace(str[end])
		if before && after {
			return true, true
		}
		if !found {
			// Remember the first occurrence for the fallback result.
			found = true
			firstBefore, firstAfter = before, after
		}
	}
	return firstBefore, firstAfter
}

// separators returns the strings that are treated as token separators. Per the
// original author's note the list is meant to stay in sync with the separators
// used by the Elasticsearch search. Entries that are commented out are not
// treated as separators.
func (w *Words) separators() []string {
	return []string{
		// ASCII punctuation.
		"~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")",
		// "_",
		"+",

		"`", "-", "=",
		"{", "}", "|",
		"[", "]", "\\",

		// ":",
		"\"",

		";",
		// "'",

		"<", ">", "?",

		",",
		// ".",
		"/",

		// Line breaks and tabs.
		"\r\n", "\r", "\n", "\t",

		// Full-width / CJK punctuation.
		"!", "¥", "…", "(", ")", "—",

		// "·",

		"【", "】", "、",

		";",
		// "‘",
		// "’",

		// ":",
		// NOTE(review): the closing quote "”" is not listed although the
		// opening "“" is; confirm against the ES analyzer configuration.
		"“",

		"》", "《", "?",
		",", "。",
	}
}
|