分词调整

This commit is contained in:
gotoeasy 2022-06-30 21:24:05 +08:00
parent 8b72662de6
commit 06b3ad1bb8

View File

@ -5,7 +5,6 @@ package tokenizer
import ( import (
"glc/ldb/conf" "glc/ldb/conf"
"log"
"os" "os"
"regexp" "regexp"
"strings" "strings"
@ -55,7 +54,7 @@ func CutForSearchEx(text string, addWords []string, delWords []string) []string
txt := strings.ToLower(text) txt := strings.ToLower(text)
// 结巴分词 // 结巴分词
sch := seg.CutForSearch(txt, true) sch := seg.CutForSearch(txt+" "+strings.Join(addWords, " "), true) // TODO 暂且补丁
var mapStr = make(map[string]string) var mapStr = make(map[string]string)
tmp := "" tmp := ""
for word := range sch { for word := range sch {
@ -67,8 +66,8 @@ func CutForSearchEx(text string, addWords []string, delWords []string) []string
// 简单分词 // 简单分词
if simpleCutMode { if simpleCutMode {
// 针对日志再保留特殊字符(【.】用于包名,【/】用于路径或日期,【_】常用于表名) // 针对日志再保留特殊字符(【.】用于包名,【/】用于路径或日期,【_】常用于表名,【-】常用于日期或连词)
txt = replaceByRegex(txt, "[,/;\\-\"'??,。!=@#\\[\\]【】\\\\:]", " ") // 【-】相对杂乱会出现少许冲突,不方便保留 txt = replaceByRegex(txt, "[,/;\"'??,。!=@#\\[\\]【】\\\\:]", " ")
//log.Println(txt) //log.Println(txt)
keys := strings.Split(txt, " ") keys := strings.Split(txt, " ")
for _, word := range keys { for _, word := range keys {
@ -116,35 +115,35 @@ func replaceByRegex(str string, rule string, replace string) string {
return reg.ReplaceAllString(str, replace) return reg.ReplaceAllString(str, replace)
} }
// 检索用文字进行分词,以及针对检索特殊场景的优化 // // 检索用文字进行分词,以及针对检索特殊场景的优化
func GetSearchKey(searchKey string) string { // func GetSearchKey(searchKey string) string {
if searchKey == "" { // if searchKey == "" {
return "" // return ""
} // }
var mapKey = make(map[string]string) // var mapKey = make(map[string]string)
kws := CutForSearch(searchKey) // kws := CutForSearch(searchKey)
for _, k := range kws { // for _, k := range kws {
mapKey[k] = "" // mapKey[k] = ""
} // }
for _, kw := range kws { // for _, kw := range kws {
ks := CutForSearch(kw) // ks := CutForSearch(kw)
if len(ks) > 1 { // if len(ks) > 1 {
for _, k := range ks { // for _, k := range ks {
delete(mapKey, k) // delete(mapKey, k)
} // }
mapKey[kw] = "" // mapKey[kw] = ""
} // }
} // }
var rs []string // var rs []string
for k := range mapKey { // for k := range mapKey {
rs = append(rs, k) // rs = append(rs, k)
} // }
// TODO // // TODO
log.Println("搜索关键词", kws, "优化后搜索", rs) // log.Println("搜索关键词", kws, "优化后搜索", rs)
return strings.Join(rs, " ") // return strings.Join(rs, " ")
} // }