Revise digest: add a more complex config

This commit is contained in:
xing 2024-04-09 23:51:45 +08:00
parent a78815f3d3
commit ee9ba3fcf0
4 changed files with 265 additions and 49 deletions

View File

@ -22,6 +22,24 @@ var more = regexp.MustCompile("<!--more(.*?)?-->")
var removeWpBlock = regexp.MustCompile("<!-- /?wp:.*-->") var removeWpBlock = regexp.MustCompile("<!-- /?wp:.*-->")
// DigestConfig holds the digest settings decoded from YAML, plus the lookup
// table derived from them by InitDigestCache.
type DigestConfig struct {
// DigestWordCount is read from the "digestWordCount" key.
// NOTE(review): presumably the digest length limit — confirm against the callers.
DigestWordCount int `yaml:"digestWordCount"`
// DigestAllowTag is read from the "digestAllowTag" key; InitDigestCache may
// fill it from the global config's DigestAllowTag.
DigestAllowTag string `yaml:"digestAllowTag"`
// DigestRegex, when non-empty, is handed to digest.SetQutos by InitDigestCache.
DigestRegex string `yaml:"digestRegex"`
// DigestTagOccupyNum lists per-tag counting rules.
DigestTagOccupyNum []struct {
// Tag names one or more tags written back to back, e.g. "<pre><code>";
// spaces are removed and the string is split on "<" so each tag gets the rule.
Tag string `yaml:"tag"`
// Num is copied into SpecialSolveConf.Num for each listed tag.
Num int `yaml:"num"`
// ChuckOvered is copied into SpecialSolveConf.ChuckOvered for each listed tag.
ChuckOvered bool `yaml:"chuckOvered"`
// EscapeCharacter lists rules for individual characters and for nested tags
// under the tags named in Tag.
EscapeCharacter []struct {
// Tags optionally names nested tags (same "<a><b>" syntax as Tag) that get
// this entry's Num/ChuckOvered.
Tags string `yaml:"tags"`
// Character lists the characters this entry applies to; only entries that
// are exactly one rune long are used.
Character []string `yaml:"character"`
// Num is the count assigned to each listed character or nested tag.
Num int `yaml:"num"`
// ChuckOvered is stored alongside Num for each listed character or nested tag.
ChuckOvered bool `yaml:"chuckOvered"`
} `yaml:"escapeCharacter"`
} `yaml:"digestTagOccupyNum"`
// specialSolve is built by InitDigestCache from DigestTagOccupyNum, keyed by
// tag in "<name" + ... form, and passed to digest.CustomizeHtml. It is
// unexported, so it is not populated by YAML decoding.
specialSolve map[string]digest.SpecialSolveConf
}
var digestConfig *safety.Var[DigestConfig] var digestConfig *safety.Var[DigestConfig]
func InitDigestCache() { func InitDigestCache() {
@ -37,15 +55,57 @@ func InitDigestCache() {
c.DigestAllowTag = config.GetConfig().DigestAllowTag c.DigestAllowTag = config.GetConfig().DigestAllowTag
return c return c
} }
if len(c.DigestTagOccupyNum) > 0 { if c.DigestRegex != "" {
c.tagNum = map[string]int{} digest.SetQutos(c.DigestRegex)
for _, item := range c.DigestTagOccupyNum { }
tags := strings.Split(item.Tag, "<") if len(c.DigestTagOccupyNum) <= 1 {
for _, tag := range tags { return c
if tag == "" { }
continue c.specialSolve = map[string]digest.SpecialSolveConf{}
for _, item := range c.DigestTagOccupyNum {
tags := strings.Split(strings.ReplaceAll(item.Tag, " ", ""), "<")
for _, tag := range tags {
if tag == "" {
continue
}
tag = str.Join("<", tag)
var ec map[rune]digest.SpecialSolve
var specialTags map[string]digest.SpecialSolve
if len(item.EscapeCharacter) > 0 {
ec = make(map[rune]digest.SpecialSolve)
for _, esc := range item.EscapeCharacter {
for _, i := range esc.Character {
s := []rune(i)
if len(s) == 1 {
ec[s[0]] = digest.SpecialSolve{
Num: esc.Num,
ChuckOvered: esc.ChuckOvered,
}
}
}
if esc.Tags == "" {
continue
}
tagss := strings.Split(strings.ReplaceAll(esc.Tags, " ", ""), "<")
specialTags = make(map[string]digest.SpecialSolve)
for _, t := range tagss {
if t == "" {
continue
}
t = str.Join("<", t)
specialTags[t] = digest.SpecialSolve{
Num: esc.Num,
ChuckOvered: esc.ChuckOvered,
}
}
} }
c.tagNum[str.Join("<", tag)] = item.Num
}
c.specialSolve[tag] = digest.SpecialSolveConf{
Num: item.Num,
ChuckOvered: item.ChuckOvered,
EscapeCharacter: ec,
Tags: specialTags,
} }
} }
} }
@ -53,16 +113,6 @@ func InitDigestCache() {
}, "digestConfig") }, "digestConfig")
} }
// DigestConfig holds the digest settings decoded from YAML, plus the per-tag
// count table derived from them.
type DigestConfig struct {
// DigestWordCount is read from the "digestWordCount" key.
// NOTE(review): presumably the digest length limit — confirm against the callers.
DigestWordCount int `yaml:"digestWordCount"`
// DigestAllowTag is read from the "digestAllowTag" key.
DigestAllowTag string `yaml:"digestAllowTag"`
// DigestTagOccupyNum lists how many counted characters each tag occupies.
DigestTagOccupyNum []struct {
// Tag names one or more tags written back to back, e.g. "<img><table>".
Tag string `yaml:"tag"`
// Num is the count assigned to each tag listed in Tag.
Num int `yaml:"num"`
} `yaml:"digestTagOccupyNum"`
// tagNum maps a tag to its Num; it is unexported, so it is not populated by
// YAML decoding.
tagNum map[string]int
}
func RemoveWpBlock(s string) string { func RemoveWpBlock(s string) string {
return removeWpBlock.ReplaceAllString(s, "") return removeWpBlock.ReplaceAllString(s, "")
} }
@ -94,7 +144,12 @@ func Digests(content string, id uint64, limit int, fn func(id uint64, content, c
if length <= limit { if length <= limit {
return content return content
} }
content, closeTag = digest.Html(content, limit, c.tagNum) if len(c.specialSolve) > 0 {
content, closeTag = digest.CustomizeHtml(content, limit, c.specialSolve)
} else {
content, closeTag = digest.Html(content, limit)
}
if fn == nil { if fn == nil {
return PostsMore(id, content, closeTag) return PostsMore(id, content, closeTag)
} }

View File

@ -67,13 +67,46 @@ digestWordCount: 300
# 摘要允许的标签 默认为<a><b><blockquote><br><cite><code><dd><del><div><dl><dt><em><h1><h2><h3><h4><h5><h6><i><img><li><ol><p><pre><span><strong><ul> # 摘要允许的标签 默认为<a><b><blockquote><br><cite><code><dd><del><div><dl><dt><em><h1><h2><h3><h4><h5><h6><i><img><li><ol><p><pre><span><strong><ul>
digestTag: "<a><b><blockquote><br><cite><code><dd><del><div><dl><dt><em><h1><h2><h3><h4><h5><h6><i><img><li><ol><p><pre><span><strong><ul>" digestTag: "<a><b><blockquote><br><cite><code><dd><del><div><dl><dt><em><h1><h2><h3><h4><h5><h6><i><img><li><ol><p><pre><span><strong><ul>"
# 可以设置每个标签占用的字数默认都为0 set tag occupied num, default every tag occupied 0 # 设置html转义实体正则 the html coded character set regex file: plugin/digest/digest.go:12
digestTagOccupyNum: [ #digestRegex: "&quot;*|&amp;*|&lt;*|&gt;*|&nbsp;*|&#91;*|&#93;*|&emsp;*"
{
tag: "<img><table>", # 可以设置每个标签或者转义字符占用的字数默认都为0 set tag or escape character occupied num, default every tag occupied 0
num: 2 #digestTagOccupyNum: [
}, # {
] # tag: "<top>", # 最外层固定tag the fixed outermost tag
# num: 0,
# chuckOvered: false,
# escapeCharacter: [
# {
# character: [ "\n","\r","\t" ],
# num: 0
# },
# ]
# },{
# tag: "<img>",
# num: 1,
# chuckOvered: false
# },
# {
# tag: "<pre><code>",
# num: 0,
# escapeCharacter: [
# {
# character: ["\t"],
# num: 4,
# chuckOvered: false,
# },
# {
# character: ["\n","\r"],
# num: 1
# },
# {
# tags: "<br>",
# num: 1
# },
# ]
# },
#]
# 到达指定并发请求数时随机sleep # 到达指定并发请求数时随机sleep
maxRequestSleepNum: 100 maxRequestSleepNum: 100

View File

@ -133,6 +133,13 @@ func StripTagsX(str, allowable string) string {
var selfCloseTags = map[string]string{"area": "", "base": "", "basefont": "", "br": "", "col": "", "command": "", "fecolormatrix": "", "embed": "", "frame": "", "hr": "", "img": "", "input": "", "isindex": "", "link": "", "fecomposite": "", "fefuncr": "", "fefuncg": "", "fefuncb": "", "fefunca": "", "meta": "", "param": "", "!doctype": "", "source": "", "track": "", "wbr": ""} var selfCloseTags = map[string]string{"area": "", "base": "", "basefont": "", "br": "", "col": "", "command": "", "fecolormatrix": "", "embed": "", "frame": "", "hr": "", "img": "", "input": "", "isindex": "", "link": "", "fecomposite": "", "fefuncr": "", "fefuncg": "", "fefuncb": "", "fefunca": "", "meta": "", "param": "", "!doctype": "", "source": "", "track": "", "wbr": ""}
// GetSelfCloseTags returns the package-level table of self-closing tag names.
// The map itself is returned, not a copy, so writes through the result change
// the package's table.
func GetSelfCloseTags() map[string]string {
return selfCloseTags
}
// SetSelfCloseTags replaces the package-level table of self-closing tag names
// with m. The variable is reassigned rather than updated in place, so a map
// obtained earlier from GetSelfCloseTags keeps referring to the previous table.
func SetSelfCloseTags(m map[string]string) {
selfCloseTags = m
}
func CloseTag(str string) string { func CloseTag(str string) string {
tags := tag.FindAllString(str, -1) tags := tag.FindAllString(str, -1)
if len(tags) < 1 { if len(tags) < 1 {

View File

@ -9,7 +9,24 @@ import (
"unicode/utf8" "unicode/utf8"
) )
var quto = regexp.MustCompile(`&quot; *|&amp; *|&lt; *|&gt; ?|&nbsp; *`) var quto = regexp.MustCompile(`&quot;*|&amp;*|&lt;*|&gt;*|&nbsp;*|&#91;*|&#93;*|&emsp;*`)
// SetQutos replaces the package-level pattern used to locate HTML escape
// entities with the regular expression reg.
//
// The pattern usually comes from user configuration, so an invalid expression
// is reported through the returned error instead of panicking; in that case
// the previously installed pattern stays in effect. Callers that invoke
// SetQutos as a plain statement keep compiling, but should check the error.
func SetQutos(reg string) error {
	compiled, err := regexp.Compile(reg)
	if err != nil {
		return err
	}
	quto = compiled
	return nil
}
// SpecialSolveConf is the counting rule CustomizeHtml applies to one tag.
type SpecialSolveConf struct {
// Num is added to the running count when the tag is encountered.
Num int
// ChuckOvered, when true, makes CustomizeHtml cut the content just before
// the tag if adding Num would push the count past the limit.
ChuckOvered bool
// EscapeCharacter gives per-rune rules for text whose innermost open tag is
// this tag; runes not listed count as 1.
EscapeCharacter map[rune]SpecialSolve
// Tags gives rules for tags encountered while this tag is their parent;
// a match here takes precedence over the tag's own top-level rule.
Tags map[string]SpecialSolve
}
// SpecialSolve is the counting rule for a single character or nested tag.
type SpecialSolve struct {
// Num is the amount added to the running count for the character or tag.
Num int
// ChuckOvered, when true, makes CustomizeHtml stop before the character or
// tag if adding Num would push the count past the limit.
ChuckOvered bool
}
var selfCloseTags = html.GetSelfCloseTags()
func StripTags(content, allowTag string) string { func StripTags(content, allowTag string) string {
content = strings.Trim(content, " \t\n\r\000\x0B") content = strings.Trim(content, " \t\n\r\000\x0B")
@ -18,7 +35,7 @@ func StripTags(content, allowTag string) string {
return content return content
} }
func Html(content string, limit int, m map[string]int) (string, string) { func Html(content string, limit int) (string, string) {
closeTag := "" closeTag := ""
length := utf8.RuneCountInString(content) + 1 length := utf8.RuneCountInString(content) + 1
if length <= limit { if length <= limit {
@ -39,19 +56,16 @@ func Html(content string, limit int, m map[string]int) (string, string) {
total := len(ru) total := len(ru)
l, r := '<', '>' l, r := '<', '>'
i := -1 i := -1
var tag []rune
for { for {
i++ i++
for len(runeIndex) > 0 && i >= runeIndex[0][0] { if end >= limit || i >= total {
ints := runeIndex[0] break
if ints[0] <= i { }
i = ints[1] for len(runeIndex) > 0 && i == runeIndex[0][0] {
runeIndex = runeIndex[1:] i = runeIndex[0][1]
end++ runeIndex = runeIndex[1:]
continue end++
} else { continue
break
}
} }
if end >= limit || i >= total { if end >= limit || i >= total {
@ -62,19 +76,10 @@ func Html(content string, limit int, m map[string]int) (string, string) {
continue continue
} else if ru[i] == r { } else if ru[i] == r {
tagIn = false tagIn = false
if len(m) > 0 {
tags := str.Join("<", strings.Split(string(tag), " ")[0], ">")
tag = tag[:0]
if n, ok := m[tags]; ok && n > 0 {
end += n
}
}
continue continue
} }
if tagIn == false { if tagIn == false && ru[i] != '\n' {
end++ end++
} else if len(m) > 0 {
tag = append(tag, ru[i])
} }
} }
if i > total { if i > total {
@ -84,3 +89,119 @@ func Html(content string, limit int, m map[string]int) (string, string) {
closeTag = html.CloseTag(content) closeTag = html.CloseTag(content)
return content, closeTag return content, closeTag
} }
// CustomizeHtml cuts HTML content once roughly limit characters have been
// counted, applying the per-tag rules in m.
//
// Counting rules:
//   - every match of the package-level entity pattern counts as 1;
//   - characters between '<' and '>' are not counted; when a tag ends, the rule
//     from m[parent].Tags[tag] (preferred) or m[tag] adds its Num;
//   - a text rune counts as 1 unless the innermost open tag has an
//     EscapeCharacter rule for that rune, in which case the rule's Num is used;
//   - a rule with ChuckOvered set stops the scan before the offending tag or
//     rune when adding its Num would exceed limit.
//
// Keys of m and of the nested Tags maps are tags in "<name>" form. The
// outermost level is represented by the pseudo tag "<top>".
//
// It returns the (possibly shortened) content and the result of
// html.CloseTag on it. When the content has fewer runes than limit it is
// returned unchanged with an empty second value.
func CustomizeHtml(content string, limit int, m map[string]SpecialSolveConf) (string, string) {
	if utf8.RuneCountInString(content)+1 <= limit {
		return content, ""
	}

	// The pattern reports byte offsets; the scan below works on runes.
	var runeIndex [][]int
	if index := quto.FindAllStringIndex(content, -1); len(index) > 0 {
		runeIndex = slice.Map(index, func(t []int) []int {
			return slice.Map(t, func(i int) int {
				return utf8.RuneCountInString(content[:i])
			})
		})
	}

	// Read the table on every call so a later html.SetSelfCloseTags is
	// honoured; a snapshot taken at package init would go stale.
	selfClosing := html.GetSelfCloseTags()

	runeContent := []rune(content)
	runeTotal := len(runeContent)
	count := 0
	tagIn := false
	tagLocal := 0 // rune offset of the most recent '<'
	var tag []rune
	// Stack of currently open tags; "<top>" is the permanent root.
	allTags := []string{"<top>"}

	i := -1
	for {
		i++
		if count >= limit || i >= runeTotal {
			break
		}
		// Jump over entities starting here; each one counts as one character.
		for len(runeIndex) > 0 && i == runeIndex[0][0] {
			i = runeIndex[0][1]
			runeIndex = runeIndex[1:]
			count++
		}
		if count >= limit || i >= runeTotal {
			break
		}

		c := runeContent[i]
		if c == '<' {
			tagLocal = i
			tagIn = true
			continue
		}

		if tagIn && c == '>' {
			tagIn = false
			// Keep only the tag name, dropping any attributes.
			name, _, _ := strings.Cut(string(tag), " ")
			tags := str.Join("<", name, ">")
			tag = tag[:0]

			rawTag := strings.ReplaceAll(strings.Trim(tags, "<>"), "/", "")
			parentTag := allTags[len(allTags)-1]
			if _, ok := selfClosing[rawTag]; !ok {
				if tags[1] == '/' {
					// A stray closing tag with nothing but the root open must
					// not pop the root (that used to index out of range).
					if len(allTags) > 1 {
						parentTag = allTags[len(allTags)-2]
						allTags = allTags[:len(allTags)-1]
					}
				} else {
					allTags = append(allTags, tags)
				}
			}

			// Lookups on a missing key or nil map simply report !ok.
			if n, ok := m[parentTag].Tags[tags]; ok {
				if count+n.Num > limit && n.ChuckOvered {
					i = tagLocal
					break
				}
				count += n.Num
				continue
			}
			if n, ok := m[tags]; ok {
				if count+n.Num > limit && n.ChuckOvered {
					i = tagLocal
					break
				}
				count += n.Num
			}
			continue
		}

		if tagIn {
			tag = append(tag, c)
			continue
		}

		// Plain text: use the innermost open tag's rule for this rune, else
		// count it as one character. This also covers tags that have a rule
		// but no EscapeCharacter map, whose text was previously never counted.
		n, special := m[allTags[len(allTags)-1]].EscapeCharacter[c]
		if !special {
			count++
			continue
		}
		if count+n.Num > limit && n.ChuckOvered {
			break
		}
		count += n.Num
	}

	if i > runeTotal {
		i = runeTotal
	}
	content = string(runeContent[:i])
	return content, html.CloseTag(content)
}