words filter support

敏感词
This commit is contained in:
xxjwxc
2022-03-31 17:23:37 +08:00
parent 2f41840802
commit f6b3d2a143
5 changed files with 310 additions and 19 deletions

7
go.mod
View File

@@ -8,7 +8,6 @@ require (
github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394
github.com/bitly/go-simplejson v0.5.0
github.com/btcsuite/winsvc v1.0.0
github.com/garyburd/redigo v1.6.3
github.com/go-redis/redis/v8 v8.4.11
github.com/go-sql-driver/mysql v1.5.0
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
@@ -17,7 +16,6 @@ require (
github.com/jander/golog v0.0.0-20150917071935-954a5be801fc
github.com/jinzhu/gorm v1.9.12
github.com/jroimartin/gocui v0.4.0
github.com/juju/ratelimit v1.0.1
github.com/kardianos/service v1.0.0
github.com/muesli/cache2go v0.0.0-20200423001931-a100c5aac93f
github.com/nicksnyder/go-i18n/v2 v2.0.3
@@ -28,14 +26,11 @@ require (
github.com/syndtr/goleveldb v1.0.0
github.com/wenzhenxi/gorsa v0.0.0-20210524035706-528c7050d703
github.com/xxjwxc/gowp v0.0.0-20200603130651-4d7368b0e285
github.com/yudeguang/iox v0.0.0-20180519090448-bffdb29c87c0 // indirect
github.com/yudeguang/ratelimit v0.0.0-20220109125206-af2bdcdaf64a
go.uber.org/ratelimit v0.2.0
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/zap v1.10.0
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/text v0.3.3
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4
google.golang.org/grpc v1.29.1
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
gopkg.in/eapache/queue.v1 v1.1.0

13
go.sum
View File

@@ -5,8 +5,6 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/andres-erbsen/clock v0.0.0-20160526145045-9e14626cd129 h1:MzBOUgng9orim59UnfUTLRjMpd09C5uEVQ6RPGeCaVI=
github.com/andres-erbsen/clock v0.0.0-20160526145045-9e14626cd129/go.mod h1:rFgpPQZYZ8vdbc+48xibu8ALc3yeyd64IhHS+PU6Yyg=
github.com/ant0ine/go-json-rest v3.3.2+incompatible h1:nBixrkLFiDNAW0hauKDLc8yJI6XfrQumWvytE1Hk14E=
github.com/ant0ine/go-json-rest v3.3.2+incompatible/go.mod h1:q6aCt0GfU6LhpBsnZ/2U+mwe+0XB5WStbmwyoPfc+sk=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
@@ -54,8 +52,6 @@ github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHqu
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/garyburd/redigo v1.6.3 h1:HCeeRluvAgMusMomi1+6Y5dmFOdYV/JzoRrrbFlkGIc=
github.com/garyburd/redigo v1.6.3/go.mod h1:rTb6epsqigu3kYKBnaF028A7Tf/Aw5s0cqA47doKKqw=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE=
@@ -120,8 +116,6 @@ github.com/jinzhu/now v1.1.1/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
github.com/jroimartin/gocui v0.4.0 h1:52jnalstgmc25FmtGcWqa0tcbMEWS6RpFLsOIO+I+E8=
github.com/jroimartin/gocui v0.4.0/go.mod h1:7i7bbj99OgFHzo7kB2zPb8pXLqMBSQegY7azfqXMkyY=
github.com/juju/ratelimit v1.0.1 h1:+7AIFJVQ0EQgq/K9+0Krm7m530Du7tIz0METWzN0RgY=
github.com/juju/ratelimit v1.0.1/go.mod h1:qapgC/Gy+xNh9UxzV13HGGl/6UXNN+ct+vwSgWNm/qk=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/kardianos/service v1.0.0 h1:HgQS3mFfOlyntWX8Oke98JcJLqt1DBcHR4kxShpYef0=
github.com/kardianos/service v1.0.0/go.mod h1:8CzDhVuCuugtsHyZoTvsOBuvonN/UDBvl0kH+BUxvbo=
@@ -219,10 +213,6 @@ github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:
github.com/xxjwxc/gowp v0.0.0-20200603130651-4d7368b0e285 h1:gbdax2ZvHZwe8zxu7by/HMuDUS47iHR2zmEzlgAHBMw=
github.com/xxjwxc/gowp v0.0.0-20200603130651-4d7368b0e285/go.mod h1:yJ/fY5BorWARfDDsxBU/MyQTHc5MVyNcqBQQYD6MN0k=
github.com/xxjwxc/public v0.0.0-20200603115833-341beff27850/go.mod h1:fp3M+FEQrCgWD1fZ/PLwZkCTglf086OEhC9LcydAUnc=
github.com/yudeguang/iox v0.0.0-20180519090448-bffdb29c87c0 h1:EIjQmYpnyudINP5M6Y3hFT/AA9SEaZ6La0MtHRkb0X0=
github.com/yudeguang/iox v0.0.0-20180519090448-bffdb29c87c0/go.mod h1:/yeZ8yPyE9g4jM7Z8LPKwi1L9lDGmLGQ0ywR4rtdNdY=
github.com/yudeguang/ratelimit v0.0.0-20220109125206-af2bdcdaf64a h1:z/xYclBL+mgRK5R8RI/jkUuLFXFYjxWI4aaRLwxi85c=
github.com/yudeguang/ratelimit v0.0.0-20220109125206-af2bdcdaf64a/go.mod h1:NcFk/p88iJxUWYrlDIat7mJLufpsHExnYvxUkApkhJc=
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
go.opentelemetry.io/otel v0.16.0 h1:uIWEbdeb4vpKPGITLsRVUS44L5oDbDUCZxn8lkxhmgw=
go.opentelemetry.io/otel v0.16.0/go.mod h1:e4GKElweB8W2gWUqbghw0B8t5MCTccc9212eNHnOHwA=
@@ -231,8 +221,6 @@ go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/multierr v1.1.0 h1:HoEmRHQPVSqub6w2z2d2EOVs2fjyFRGyofhKuyDq0QI=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/ratelimit v0.2.0 h1:UQE2Bgi7p2B85uP5dC2bbRtig0C+OeNRnNEafLjsLPA=
go.uber.org/ratelimit v0.2.0/go.mod h1:YYBV4e4naJvhpitQrWJu1vCpgB7CboMe0qhltKt6mUg=
go.uber.org/zap v1.10.0 h1:ORx85nbTijNz8ljznvCMR1ZBIPKFn3jQrag10X2AsuM=
go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
@@ -287,7 +275,6 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4 h1:SvFZT6jyqRaOeXpc5h/JSfZenJ2O330aBsf7JfSUXmQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

157
wordsfilter/node.go Normal file
View File

@@ -0,0 +1,157 @@
package wordsfilter
import (
"bytes"
"strings"
)
// Node is a single element of the sensitive-word tree. Each node is reached
// from its parent through one character of a word.
type Node struct {
	// Child maps the next character (as a one-rune string) to the node it
	// leads to.
	Child map[string]*Node
	// Placeholders is the replacement text for the word ending at this node.
	// An empty value means no word ends here.
	Placeholders string
}
// NewNode returns a node that holds the given child map and placeholder text.
// The child map is stored as-is (not copied).
func NewNode(child map[string]*Node, placeholders string) *Node {
	n := new(Node)
	n.Child = child
	n.Placeholders = placeholders
	return n
}
// add inserts text into the word tree rooted at root, one node per character.
//
// The node for the final character gets Placeholders set to placeholder
// repeated once per character of text; that non-empty value is what marks the
// end of a word. Consequently an empty placeholder stores the path but leaves
// the word unmarked. Empty text is ignored. root must be a non-nil map because
// new entries are written into it.
func (node *Node) add(text string, root map[string]*Node, placeholder string) {
	runes := []rune(text)
	if len(runes) == 0 {
		return
	}
	level := root
	lastIdx := len(runes) - 1
	for idx, r := range runes {
		key := string(r)
		isLast := idx == lastIdx
		cur, found := level[key]
		switch {
		case !found:
			// Unknown character: create the node, and always descend into its
			// fresh (empty) child map.
			mask := ""
			next := make(map[string]*Node)
			if isLast {
				mask = strings.Repeat(placeholder, len(runes))
			}
			level[key] = NewNode(next, mask)
			level = next
		case isLast:
			// The path already exists; just mark it as a complete word.
			cur.Placeholders = strings.Repeat(placeholder, len(runes))
		default:
			if cur.Child == nil {
				cur.Child = make(map[string]*Node)
			}
			level = cur.Child
		}
	}
}
// remove deletes the word text from the word tree rooted at root.
//
// The end-of-word marker (Placeholders) of the final node is cleared, and then
// every node on the path that no longer ends a word and has no children is
// deleted, walking back towards the root. Pruning keeps the tree from
// accumulating dead branches and avoids leaving childless, unmarked nodes
// behind. Nothing happens when text is empty or its path is not present in
// the tree.
func (node *Node) remove(text string, root map[string]*Node) {
	runes := []rune(text)
	if len(runes) == 0 {
		return
	}

	// levels[i] is the map that holds the node for runes[i].
	levels := make([]map[string]*Node, 0, len(runes))
	level := root
	var last *Node
	for _, r := range runes {
		n, ok := level[string(r)] // indexing a nil map is safe and reports !ok
		if !ok || n == nil {
			return
		}
		levels = append(levels, level)
		last = n
		level = n.Child
	}
	last.Placeholders = ""

	// Prune bottom-up; stop at the first node that is still needed.
	for i := len(runes) - 1; i >= 0; i-- {
		key := string(runes[i])
		n := levels[i][key]
		if n.Placeholders != "" || len(n.Child) > 0 {
			break
		}
		delete(levels[i], key)
	}
}
// replace returns text with every sensitive word found in the tree rooted at
// root substituted by that word's Placeholders string.
//
// Matching is leftmost-longest: at each position the tree is walked as far as
// the text allows, and the deepest node that ends a word wins. Text that is
// not part of any match is copied through unchanged. A nil root or empty text
// returns text as-is.
func (node *Node) replace(text string, root map[string]*Node) string {
	if root == nil || text == "" {
		return text
	}
	runes := []rune(text)
	total := len(runes)

	var out bytes.Buffer
	pending := 0 // start of the unmatched run that has not been written yet
	for pos := 0; pos < total; {
		// Walk the tree from pos, remembering the longest word seen so far.
		var hit *Node
		hitEnd := pos
		level := root
		for j := pos; j < total; j++ {
			n, ok := level[string(runes[j])] // a nil level simply reports !ok
			if !ok || n == nil {
				break
			}
			if n.Placeholders != "" {
				hit, hitEnd = n, j+1
			}
			level = n.Child
		}
		if hit == nil {
			pos++
			continue
		}
		out.WriteString(string(runes[pending:pos]))
		out.WriteString(hit.Placeholders)
		pos, pending = hitEnd, hitEnd
	}
	out.WriteString(string(runes[pending:]))
	return out.String()
}
// contains reports whether text contains at least one sensitive word stored
// in the tree rooted at root.
//
// Every position of text is tried as a potential word start, and the walk
// always restarts from root, so a partial match never leaks into the next
// attempt. A word is recognised as soon as a node with non-empty Placeholders
// is reached. A nil or empty root, or empty text, yields false.
func (node *Node) contains(text string, root map[string]*Node) bool {
	if len(root) == 0 || text == "" {
		return false
	}
	runes := []rune(text)
	for start := range runes {
		level := root
		for _, r := range runes[start:] {
			n, ok := level[string(r)] // a nil level simply reports !ok
			if !ok || n == nil {
				break
			}
			if n.Placeholders != "" {
				return true
			}
			level = n.Child
		}
	}
	return false
}

118
wordsfilter/words_filter.go Normal file
View File

@@ -0,0 +1,118 @@
package wordsfilter
import (
"bufio"
"bytes"
"io"
"os"
"strings"
"sync"
)
// Defaults copied into every WordsFilter created by New.
var (
	// DefaultPlaceholder is the text repeated once per character of a
	// sensitive word to build its replacement.
	DefaultPlaceholder = "*"
	// DefaultStripSpace controls whether whitespace is removed from input
	// before it is processed.
	DefaultStripSpace = true
)
// WordsFilter adds, removes, detects and masks sensitive words in caller
// supplied word trees. Its methods serialise access with an internal
// read/write mutex, so a WordsFilter must be used through a pointer.
type WordsFilter struct {
	// Placeholder is repeated once per character of a word to form the
	// replacement text stored when the word is added.
	Placeholder string
	// StripSpace makes every method remove whitespace from its text argument
	// before using it.
	StripSpace bool

	node  *Node        // receiver for the tree operations
	mutex sync.RWMutex // guards tree access made through this filter
}
// New returns a WordsFilter initialised from DefaultPlaceholder and
// DefaultStripSpace, with an empty internal node.
func New() *WordsFilter {
	wf := new(WordsFilter)
	wf.Placeholder = DefaultPlaceholder
	wf.StripSpace = DefaultStripSpace
	wf.node = NewNode(make(map[string]*Node), "")
	return wf
}
// Generate builds a new word tree from texts and returns its root map.
// Each entry is inserted with Add, so the filter's StripSpace and Placeholder
// settings apply.
func (wf *WordsFilter) Generate(texts []string) map[string]*Node {
	tree := map[string]*Node{}
	for i := 0; i < len(texts); i++ {
		wf.Add(texts[i], tree)
	}
	return tree
}
// GenerateWithFile builds a word tree from the file at path, which must hold
// one sensitive word per line. Surrounding whitespace is trimmed from each
// line and blank lines are skipped.
//
// Lines are read in full regardless of their length, so an over-long line is
// never split into several words. It returns a nil tree and the error when the
// file cannot be opened or read.
func (wf *WordsFilter) GenerateWithFile(path string) (map[string]*Node, error) {
	fd, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer fd.Close()

	reader := bufio.NewReader(fd)
	var texts []string
	for {
		line, readErr := reader.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			return nil, readErr
		}
		// On io.EOF line still holds the final, unterminated line (if any).
		if word := strings.TrimSpace(line); word != "" {
			texts = append(texts, word)
		}
		if readErr == io.EOF {
			break
		}
	}
	return wf.Generate(texts), nil
}
// Add inserts the sensitive word text into the word tree root, using the
// filter's Placeholder to build its replacement. When StripSpace is set,
// whitespace is removed from text first. The insertion runs under the
// filter's write lock.
func (wf *WordsFilter) Add(text string, root map[string]*Node) {
	word := text
	if wf.StripSpace {
		word = stripSpace(word)
	}

	wf.mutex.Lock()
	defer wf.mutex.Unlock()

	wf.node.add(word, root, wf.Placeholder)
}
// Replace returns text with the sensitive words found in root masked by their
// placeholders. When StripSpace is set, whitespace is removed from text before
// matching, so the returned string has that whitespace removed as well. The
// lookup runs under the filter's read lock.
func (wf *WordsFilter) Replace(text string, root map[string]*Node) string {
	input := text
	if wf.StripSpace {
		input = stripSpace(input)
	}

	wf.mutex.RLock()
	defer wf.mutex.RUnlock()

	masked := wf.node.replace(input, root)
	return masked
}
// Contains reports whether text contains a sensitive word from root. When
// StripSpace is set, whitespace is removed from text before the check. The
// lookup runs under the filter's read lock.
func (wf *WordsFilter) Contains(text string, root map[string]*Node) bool {
	input := text
	if wf.StripSpace {
		input = stripSpace(input)
	}

	wf.mutex.RLock()
	defer wf.mutex.RUnlock()

	found := wf.node.contains(input, root)
	return found
}
// Remove deletes the sensitive word text from the word tree root. When
// StripSpace is set, whitespace is removed from text first. The removal runs
// under the filter's write lock.
func (wf *WordsFilter) Remove(text string, root map[string]*Node) {
	word := text
	if wf.StripSpace {
		word = stripSpace(word)
	}

	wf.mutex.Lock()
	defer wf.mutex.Unlock()

	wf.node.remove(word, root)
}
// stripSpace returns str with all whitespace removed: the string is split
// into whitespace-separated fields and the fields are concatenated back
// together with nothing in between.
func stripSpace(str string) string {
	parts := strings.Fields(str)
	joined := new(bytes.Buffer)
	for i := 0; i < len(parts); i++ {
		joined.WriteString(parts[i])
	}
	return joined.String()
}

View File

@@ -0,0 +1,34 @@
package wordsfilter
import (
"testing"
)
// TestWordsFilter exercises tree generation, removal of an unknown word,
// detection, replacement and loading a word list from a file.
func TestWordsFilter(t *testing.T) {
	texts := []string{
		"Miyamoto Musashi",
		"妲己",
		"アンジェラ",
		"ความรุ่งโรจน์",
	}
	wf := New()
	root := wf.Generate(texts)

	// Removing a word that was never added must leave the tree usable.
	wf.Remove("shif", root)

	// A strict prefix of a sensitive word is not itself a sensitive word.
	if got := wf.Contains("アン", root); got {
		t.Errorf("Test Contains expect false, get %T, %v", got, got)
	}
	if got := wf.Contains("->アンジェラ2333", root); !got {
		t.Errorf("Test Contains expect true, get %T, %v", got, got)
	}

	const want = "Game*************ilike**hero***************"
	if got := wf.Replace("Game ความรุ่งโรจน์ i like 妲己 heroMiyamotoMusashi", root); got != want {
		t.Errorf("Test Replace expect %v,get %T,%v", want, got, got)
	}

	// Test generated with file.
	root, err := wf.GenerateWithFile("./words_test.txt")
	if err != nil {
		t.Fatalf("Test GenerateWithFile unexpected error: %v", err)
	}
	if got := wf.Contains("アンジェラ", root); !got {
		t.Errorf("Test Contains (file) expect true, get %T, %v", got, got)
	}
}