package simplify

import (
	"slices"
	"strings"
	"unicode"
)

// 模拟ES部分分词规则，对查询关键字进行去重

// Words holds the two keyword lists that Simplify merges and reduces.
type Words struct {
	// whole is the secondary keyword list; input is the list whose entries
	// Simplify is intended to place first in its result.
	// NOTE(review): the exact origin of each list is decided by the caller —
	// confirm against the call sites.
	whole, input []string
}

// NewWords returns a Words that refers to the given whole and input keyword
// lists. The slices are stored as passed in, without copying.
func NewWords(whole []string, input []string) *Words {
	words := new(Words)
	words.whole = whole
	words.input = input
	return words
}

// Simplify normalizes the input and whole keywords and returns only those
// that are not made redundant by another keyword.
//
// Each keyword is trimmed and lower-cased, then every separator from
// w.separators() is replaced by a space and whitespace runs are collapsed,
// which yields a space-delimited token string. Keywords that are empty after
// this step are ignored, and keywords that normalize to the same token string
// are treated as one (the first occurrence wins, input before whole).
//
// A keyword is dropped when its token string contains another keyword's token
// string, either on token boundaries or with a Han character or space on both
// sides (as reported by checkSurroundingChineseOrSpace). The contained keyword
// is the one that can survive.
//
// The result holds the trimmed, lower-cased keywords in a stable order:
// keywords originating from input first, then those from whole, each in their
// original order. The returned slice is never nil.
func (w *Words) Simplify() []string {
	// Separator list, meant to mirror the separators used by the ES search.
	separators := w.separators()

	// candidate is one distinct normalized keyword.
	type candidate struct {
		keyword string // trimmed, lower-cased keyword as returned to the caller
		wide    string // tokens joined by single spaces, padded with one space on each side
		runes   []rune // wide as runes, the haystack for the Han/space boundary check
		core    []rune // wide without the padding, the needle for the boundary check
	}

	// input goes first so that its keywords take priority over whole.
	keywords := make([]string, 0, len(w.input)+len(w.whole))
	keywords = append(keywords, w.input...)
	keywords = append(keywords, w.whole...)

	candidates := make([]candidate, 0, len(keywords))
	seen := make(map[string]struct{}, len(keywords))
	for _, raw := range keywords {
		// Lower-case and trim so that case differences do not defeat the
		// containment checks below.
		keyword := strings.ToLower(strings.TrimSpace(raw))
		if keyword == "" {
			continue
		}

		spaced := keyword
		for _, separator := range separators {
			spaced = strings.ReplaceAll(spaced, separator, " ")
		}
		tokens := strings.Fields(spaced)
		if len(tokens) == 0 {
			// The keyword consisted of separators only.
			continue
		}

		core := strings.Join(tokens, " ")
		// Leading and trailing spaces make whole-token containment a plain
		// substring test.
		wide := " " + core + " "
		if _, dup := seen[wide]; dup {
			// Same token string as an earlier keyword: keep only the first so
			// that equivalent spellings do not eliminate each other.
			continue
		}
		seen[wide] = struct{}{}

		candidates = append(candidates, candidate{
			keyword: keyword,
			wide:    wide,
			runes:   []rune(wide),
			core:    []rune(core),
		})
	}

	// candidates is already ordered input-first, so filtering it in place
	// order gives a deterministic result with input keywords in front.
	result := make([]string, 0, len(candidates))
	for i := range candidates {
		cand := &candidates[i]
		redundant := slices.ContainsFunc(candidates, func(other candidate) bool {
			if other.wide == cand.wide {
				// Token strings are unique, so this is cand itself.
				return false
			}
			// The other keyword appears inside cand on token boundaries.
			if strings.Contains(cand.wide, other.wide) {
				return true
			}
			// The other keyword appears inside cand with a Han character or
			// space directly before and after it.
			before, after := w.checkSurroundingChineseOrSpace(cand.runes, other.core)
			return before && after
		})
		if !redundant {
			result = append(result, cand.keyword)
		}
	}
	return result
}

// isChineseOrSpace reports whether r is the ASCII space character or a rune
// in the Unicode Han script.
func (w *Words) isChineseOrSpace(r rune) bool {
	if r == ' ' {
		return true
	}
	return unicode.Is(unicode.Han, r)
}

// checkSurroundingChineseOrSpace reports whether sub occurs in str with a Han
// character or a space immediately before it and immediately after it.
//
// Every occurrence of sub is examined. If some occurrence is bounded on both
// sides, (true, true) is returned. Otherwise the flags describe the first
// occurrence: the first result is for the rune before it, the second for the
// rune after it, and a side that falls outside str counts as false. When sub
// is empty or does not occur in str, both results are false.
func (w *Words) checkSurroundingChineseOrSpace(str []rune, sub []rune) (bool, bool) {
	lenStr, lenSub := len(str), len(sub)
	if lenSub == 0 || lenSub > lenStr {
		// Nothing to locate; this also avoids indexing into an empty sub.
		return false, false
	}

	found := false
	var firstBefore, firstAfter bool
	for start := 0; start+lenSub <= lenStr; start++ {
		end := start + lenSub
		if !slices.Equal(str[start:end], sub) {
			continue
		}

		before := start > 0 && w.isChineseOrSpace(str[start-1])
		after := end < lenStr && w.isChineseOrSpace(str[end])
		if before && after {
			// One properly bounded occurrence is enough.
			return true, true
		}
		if !found {
			// Remember the first occurrence as the fallback answer.
			found = true
			firstBefore, firstAfter = before, after
		}
	}

	return firstBefore, firstAfter
}

// separators returns the strings that Simplify replaces with a space before
// splitting a keyword into tokens. According to the original author's note the
// list is meant to stay in sync with the separators used by the ES search.
//
// Entries left disabled in the original list, kept here for reference:
// "_", ":", "'", ".", "·", "‘", "’", "：".
func (w *Words) separators() []string {
	list := []string{
		// ASCII punctuation.
		"~", "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "+",
		"`", "-", "=", "{", "}", "|", "[", "]", "\\", "\"", ";",
		"<", ">", "?", ",", "/",

		// Line breaks and tab; the two-character form comes first.
		"\r\n", "\r", "\n", "\t",

		// Full-width and CJK punctuation ("、" is listed twice, as before).
		"！", "￥", "…", "（", "）", "—",
		"【", "】", "、", "；", "“",
		"》", "《", "？", "，", "。", "、",
	}
	return list
}