通用包
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
utils/simplify/words.go

181 lines
4.0 KiB

package simplify
import (
"strings"
"unicode"
)
// Words deduplicates search keywords by imitating part of the Elasticsearch
// tokenization rules.
//
// input holds the keywords that Simplify lists first in its result; whole
// holds the remaining keywords, which are appended after them.
type Words struct {
	whole, input []string
}
// NewWords builds a Words from the two keyword lists. The slices are stored
// as given; they are not copied.
func NewWords(whole []string, input []string) *Words {
	w := new(Words)
	w.whole = whole
	w.input = input
	return w
}
// Simplify removes redundant keywords from the input and whole lists and
// returns the survivors, input keywords first.
//
// Every keyword is lowercased and trimmed, each separator reported by
// w.separators() is replaced with a space, and runs of whitespace are
// collapsed. A keyword is redundant, and is dropped, when the normalized form
// of a different keyword occurs inside it either as a whole space-delimited
// phrase or at a position that w.checkSurroundingChineseOrSpace reports as
// bounded on both sides. The shortest, most general keywords therefore win.
//
// Keywords whose normalized forms are identical are collapsed to the first
// one seen (input before whole) instead of cancelling each other out.
//
// The result is never nil. Keywords are returned in their lowercased, trimmed
// form, in first-seen order: surviving input keywords, then surviving whole
// keywords.
func (w *Words) Simplify() []string {
	// candidate is one distinct keyword after normalization.
	type candidate struct {
		// keyword is the lowercased, trimmed text handed back to the caller.
		keyword string
		// padded is the normalized form wrapped in single spaces, so that a
		// plain substring test only matches whole space-delimited phrases.
		padded string
		// paddedRunes and core are rune views of padded with and without the
		// wrapping spaces, converted once instead of once per comparison.
		paddedRunes []rune
		core        []rune
		// fromInput records that the keyword was first seen in w.input.
		fromInput bool
	}

	separators := w.separators()
	total := len(w.input) + len(w.whole)
	candidates := make([]candidate, 0, total)
	seen := make(map[string]struct{}, total)

	collect := func(keywords []string, fromInput bool) {
		for _, keyword := range keywords {
			// Lowercase so that case differences do not defeat the comparison.
			keyword = strings.ToLower(strings.TrimSpace(keyword))
			if keyword == "" {
				continue
			}
			normalized := keyword
			for _, separator := range separators {
				normalized = strings.ReplaceAll(normalized, separator, " ")
			}
			core := strings.Join(strings.Fields(normalized), " ")
			if core == "" {
				// The keyword consisted of separators only.
				continue
			}
			if _, dup := seen[core]; dup {
				// Equivalent to an earlier keyword; keep the first one.
				continue
			}
			seen[core] = struct{}{}
			padded := " " + core + " "
			candidates = append(candidates, candidate{
				keyword:     keyword,
				padded:      padded,
				paddedRunes: []rune(padded),
				core:        []rune(core),
				fromInput:   fromInput,
			})
		}
	}
	// Input goes first so that it wins over an equivalent whole keyword and
	// is classified from where it came, not by comparing normalized text
	// against the raw input slice.
	collect(w.input, true)
	collect(w.whole, false)

	fromInput := make([]string, 0, len(candidates))
	var fromWhole []string
	for i := range candidates {
		cur := &candidates[i]
		redundant := false
		for j := range candidates {
			if i == j {
				continue
			}
			other := &candidates[j]
			// Another keyword appears in cur as a whole phrase.
			if strings.Contains(cur.padded, other.padded) {
				redundant = true
				break
			}
			// Another keyword appears in cur with both neighbours accepted
			// by the boundary check.
			before, after := w.checkSurroundingChineseOrSpace(cur.paddedRunes, other.core)
			if before && after {
				redundant = true
				break
			}
		}
		switch {
		case redundant:
			// Dropped.
		case cur.fromInput:
			fromInput = append(fromInput, cur.keyword)
		default:
			fromWhole = append(fromWhole, cur.keyword)
		}
	}
	return append(fromInput, fromWhole...)
}
// contains reports whether str is equal to any element of arr.
func (w *Words) contains(arr []string, str string) bool {
	for i := 0; i < len(arr); i++ {
		if arr[i] == str {
			return true
		}
	}
	return false
}
// isChineseOrSpace reports whether r is the ASCII space character or a rune
// in the Unicode Han script.
func (w *Words) isChineseOrSpace(r rune) bool {
	if r == ' ' {
		return true
	}
	return unicode.Is(unicode.Han, r)
}
// checkSurroundingChineseOrSpace looks for sub inside str and reports whether
// the rune immediately before and the rune immediately after an occurrence
// satisfy w.isChineseOrSpace.
//
// Every occurrence is examined. As soon as one is bounded on both sides the
// result is (true, true). If no occurrence qualifies, the flags describe the
// first occurrence; an occurrence at the very start or end of str has no
// neighbour on that side, so that flag is false. When sub is empty, longer
// than str, or absent from str, the result is (false, false).
func (w *Words) checkSurroundingChineseOrSpace(str []rune, sub []rune) (bool, bool) {
	lenStr, lenSub := len(str), len(sub)
	if lenSub == 0 || lenSub > lenStr {
		return false, false
	}

	found := false
	var firstBefore, firstAfter bool
	for start := 0; start+lenSub <= lenStr; start++ {
		match := true
		for j := range sub {
			if str[start+j] != sub[j] {
				match = false
				break
			}
		}
		if !match {
			continue
		}

		end := start + lenSub
		before := start > 0 && w.isChineseOrSpace(str[start-1])
		after := end < lenStr && w.isChineseOrSpace(str[end])
		if before && after {
			return true, true
		}
		// Remember the first occurrence so callers that look at the
		// individual flags still see what they saw before.
		if !found {
			found = true
			firstBefore, firstAfter = before, after
		}
	}
	return firstBefore, firstAfter
}
// separators returns the strings that are replaced with a space before
// keywords are compared. The list is meant to stay consistent with the
// separators used by the ES search.
//
// Entries that are commented out are deliberately not treated as separators.
//
// NOTE(review): the CJK group previously repeated the ASCII "!", "(", ")",
// ";", "?" and "," entries byte for byte, so the full-width forms were never
// split on. They are restored here on the assumption that the full-width
// characters were folded to half-width somewhere along the way; confirm
// against the ES analyzer configuration.
func (w *Words) separators() []string {
	return []string{
		// ASCII punctuation.
		"~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")",
		// "_",
		"+",
		"`", "-", "=",
		"{", "}", "|",
		"[", "]", "\\",
		// ":",
		"\"",
		";",
		// "'",
		"<", ">", "?",
		",",
		// ".",
		"/",
		// Line breaks and tabs.
		"\r\n", "\r", "\n", "\t",
		// CJK and full-width punctuation.
		"！", "¥", "…", "（", "）", "—",
		// "·",
		"【", "】", "、",
		"；",
		// "‘",
		// "’",
		// "：",
		"“",
		"》", "《", "？",
		"，", "。",
	}
}