Created
Some checks failed
Go / build (push) Failing after 7s

This commit is contained in:
scheibling
2025-04-08 19:16:39 +02:00
commit b4eb50ab55
63 changed files with 7333 additions and 0 deletions

186
htmlx/html.go Normal file
View File

@@ -0,0 +1,186 @@
package htmlx
import (
"regexp"
"sort"
"strings"
"unicode/utf8"
"git.cloudyne.io/go/hiscaler-gox/stringx"
)
var (
	// rxStrip matches, in a single pass, everything Strip and the
	// CleanModeAll path of Clean remove: <style> and <script> elements with
	// their contents, <link> and <meta> tags, comments and inline style
	// attributes.
	//
	// Quantifiers are lazy (or bounded by `[^>]`) on purpose: with the `s`
	// flag a greedy `.*` would run from the first opener to the LAST closer
	// in the document and swallow all content in between.
	rxStrip = regexp.MustCompile(`(?s)<sty.*?/style>|<scr.*?/script>|<link[^>]*>|<meta[^>]*>|<!--.*?-->| style=(?:"[^"]*"|'[^']*')`)

	// rxSpaceless matches the literal text "/>", a run of whitespace, then
	// "</" (the slashes are part of the pattern, not delimiters).
	rxSpaceless = regexp.MustCompile(`/>\s+</`)

	// Per-category patterns used by Clean when only some flags are set.

	// rxCleanCSS matches <style> elements, <link> tags and inline style
	// attributes quoted with either " or '.
	rxCleanCSS = regexp.MustCompile(`(?s)<sty.*?/style>|<link[^>]*>| style=(?:"[^"]*"|'[^']*')`)
	// rxCleanJavascript matches one <script> element including its body.
	rxCleanJavascript = regexp.MustCompile(`(?s)<script.*?/script>`)
	// rxCleanComment matches one HTML comment.
	rxCleanComment = regexp.MustCompile(`(?s)<!--.*?-->`)
	// rxCleanMeta matches one <meta> tag, self-closed or not.
	rxCleanMeta = regexp.MustCompile(`<meta[^>]*>`)
)
// CleanMode is a bit set selecting which kinds of markup Clean removes.
// Flags may be combined with the | operator.
type CleanMode uint32

// The individual flags occupy bits 10 down to 6, one bit each.
const (
	CleanModeCSS        CleanMode = 1 << 10 // also covers inline element styles
	CleanModeJavascript CleanMode = 1 << 9
	CleanModeComment    CleanMode = 1 << 8
	CleanModeMeta       CleanMode = 1 << 7
	CleanModeSpace      CleanMode = 1 << 6

	// CleanModeAll is the union of every individual flag above.
	CleanModeAll CleanMode = CleanModeCSS | CleanModeJavascript | CleanModeComment | CleanModeMeta | CleanModeSpace
)
// Strip removes HTML markup from html and returns the remaining text.
//
// The input is trimmed, everything rxStrip matches is deleted, and then a
// single scan drops every `<...>` span. Whatever text survives is passed
// through Spaceless and trimmed again. An empty result is returned as "".
//
// Scanning rules:
//   - Consecutive `<` characters belong to the same tag, so `<<br>` is
//     removed as a whole.
//   - Text following a `<` that is never closed is discarded.
//   - A `>` seen outside a tag still advances the copy position, so text in
//     front of a stray `>` is discarded ("&lt;div>hello<div>" yields "hello").
//
// The scan is adapted from
// https://stackoverflow.com/questions/55036156/how-to-replace-all-html-tag-with-empty-string-in-golang
func Strip(html string) string {
	html = strings.TrimSpace(html)
	if html == "" {
		return ""
	}
	html = rxStrip.ReplaceAllString(html, "")
	if html == "" {
		return ""
	}

	const (
		htmlTagStart = '<'
		htmlTagEnd   = '>'
	)

	// The output can never be longer than the input.
	var builder strings.Builder
	builder.Grow(len(html) + utf8.UTFMax)

	in := false // True while we are inside an HTML tag.
	end := 0    // Byte index just past the most recent `>`.

	// Both delimiters are ASCII and never occur inside a multi-byte UTF-8
	// sequence, so scanning bytes is safe and keeps every index a valid
	// slice boundary.
	for i := 0; i < len(html); i++ {
		switch html[i] {
		case htmlTagStart:
			// Flush the text in front of the tag exactly once; a repeated
			// `<` inside an open tag must not write it again.
			if !in {
				builder.WriteString(html[end:i])
				in = true
			}
		case htmlTagEnd:
			in = false
			end = i + 1
		}
	}

	// Keep the trailing text unless the input ended inside an open tag.
	// Doing this after the loop (instead of testing for "the last character"
	// inside it) also works when the input ends in a multi-byte rune.
	if !in {
		builder.WriteString(html[end:])
	}

	s := builder.String()
	if s != "" {
		s = strings.TrimSpace(Spaceless(s))
	}
	return s
}
// Spaceless removes superfluous whitespace from html.
//
// The input is first normalised with stringx.RemoveExtraSpace (judging by
// TestSpaceless this trims the ends and collapses whitespace runs — confirm
// against the stringx package). If nothing is left, "" is returned.
// Afterwards any whitespace between a self-closing tag end `/>` and a
// following closing tag `</` is removed.
func Spaceless(html string) string {
	html = stringx.RemoveExtraSpace(html)
	if html == "" {
		return ""
	}
	// rxSpaceless consumes the "/" of both delimiters, so the replacement
	// must put them back; emitting just "><" would turn `<br/> </p>` into
	// `<br><p>` and corrupt the document structure.
	return rxSpaceless.ReplaceAllString(html, "/></")
}
// Clean removes the categories of markup selected by cleanMode from html and
// returns the result. An empty input is returned unchanged.
//
// cleanMode is a bit set of CleanMode flags:
//   - CleanModeCSS: <style> elements, <link> tags and inline style attributes
//   - CleanModeJavascript: <script> elements
//   - CleanModeComment: HTML comments
//   - CleanModeMeta: <meta> tags
//   - CleanModeSpace: superfluous whitespace (see Spaceless)
//
// When every flag is set (CleanModeAll) the markup is removed in one pass
// with rxStrip and the result is passed through Spaceless. Otherwise the
// selected categories are removed one after another in the order listed
// above. Bits that do not correspond to a flag are ignored.
func Clean(html string, cleanMode CleanMode) string {
	if html == "" {
		return html
	}

	// Only the complete flag set takes the combined path; a single flag such
	// as CleanModeSpace must never be mistaken for "all".
	if cleanMode&CleanModeAll == CleanModeAll {
		return Spaceless(rxStrip.ReplaceAllString(html, ""))
	}

	if cleanMode&CleanModeCSS != 0 {
		html = rxCleanCSS.ReplaceAllString(html, "")
	}
	if cleanMode&CleanModeJavascript != 0 {
		html = rxCleanJavascript.ReplaceAllString(html, "")
	}
	if cleanMode&CleanModeComment != 0 {
		html = rxCleanComment.ReplaceAllString(html, "")
	}
	if cleanMode&CleanModeMeta != 0 {
		html = rxCleanMeta.ReplaceAllString(html, "")
	}
	// Whitespace goes last so that gaps left by the removals above are
	// tidied up as well.
	if cleanMode&CleanModeSpace != 0 {
		html = Spaceless(html)
	}
	return html
}
// Tag renders an HTML element named tag that wraps content.
//
// attributes are written as ` key="value"` pairs in ascending key order.
// When styles is non-empty, a style attribute follows them, built from
// `key:value;` pairs that are likewise sorted by key. Neither map may be
// relied on for ordering, hence the sorting. Values are written verbatim;
// no escaping is applied.
func Tag(tag, content string, attributes, styles map[string]string) string {
	// orderedKeys returns the keys of m in ascending order.
	orderedKeys := func(m map[string]string) []string {
		names := make([]string, 0, len(m))
		for name := range m {
			names = append(names, name)
		}
		sort.Strings(names)
		return names
	}

	var out strings.Builder
	out.Grow(len(tag)*2 + len(content) + 5)

	// Opening tag with its attributes.
	out.WriteString("<" + tag)
	for _, name := range orderedKeys(attributes) {
		out.WriteString(" " + name + `="` + attributes[name] + `"`)
	}

	// Inline styles, only when there is at least one declaration.
	if len(styles) != 0 {
		out.WriteString(` style="`)
		for _, prop := range orderedKeys(styles) {
			out.WriteString(prop + ":" + styles[prop] + ";")
		}
		out.WriteString(`"`)
	}

	// Body and closing tag.
	out.WriteString(">" + content + "</" + tag + ">")
	return out.String()
}

200
htmlx/html_test.go Normal file
View File

@@ -0,0 +1,200 @@
package htmlx
import (
"github.com/stretchr/testify/assert"
"testing"
)
// TestStrip feeds HTML fragments to Strip and checks that only the visible
// text is left over.
func TestStrip(t *testing.T) {
	type stripCase struct {
		name  string // label reported when the assertion fails
		input string // HTML handed to Strip
		want  string // text expected back
	}
	cases := []stripCase{
		{name: "t0", input: "<div>hello</div>", want: "hello"},
		{name: "t1", input: `
<div>hello</div>
`, want: "hello"},
		{name: "t3", input: "<div style='font-size: 12px;'>hello</div>", want: "hello"},
		{name: "t4", input: "<style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>", want: "hello"},
		{name: "t4", input: `
<link rel='stylesheet' id='wp-block-library-css' href='https://www.example.com/style.min.css?ver=5.9.1' type='text/css' media='all' />
<style type="text/css">body {font-size: 12px}</style><!-- / See later. --><div style='font-size: 12px;'>hello</div>`, want: "hello"},
		{name: "t5", input: `
<body class="nodata company_blog" style="">
<script> var toolbarSearchExt = '{"landingWord":[],"queryWord":"","tag":["function","class","filter","search"],"title":"Yii: 设置数据翻页"}';
</script>
<script src="https://g.csdnimg.cn/common/csdn-toolbar/csdn-toolbar.js" type="text/javascript"></script>
<script src="https://g.csdnimg.cn/common/csdn-toolbar/csdn-toolbar1.js" type="text/javascript"></script>
<script>
(function(){
var bp = document.createElement('script');
var curProtocol = window.location.protocol.split(':')[0];
if (curProtocol === 'https') {
bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
}
else {
bp.src = 'http://push.zhanzhang.baidu.com/push.js';
}
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(bp, s);
})();
</script>
<link rel="stylesheet" href="https://csdnimg.cn/release/blogv2/dist/pc/css/blog_code-01256533b5.min.css">
<link rel="stylesheet" href="https://csdnimg.cn/release/blogv2/dist/mdeditor/css/editerView/chart-3456820cac.css" /><div style='font-size: 12px;'>hello</div></body>`, want: "hello"},
		{name: "t6", input: `<!-- show up to 2 reviews by default -->
<p>Custom flags for your garden are a great way to show your personality to your friends and neighbors. Design and turn it into an eye-catching flag all year round. This will be a beautiful addition to your yard and garden, also a simple sign to show your patriotism on Memorial Day, 4th of July or Veterans Day, Christmas holidays or any holiday of the year.
</p>`, want: "Custom flags for your garden are a great way to show your personality to your friends and neighbors. Design and turn it into an eye-catching flag all year round. This will be a beautiful addition to your yard and garden, also a simple sign to show your patriotism on Memorial Day, 4th of July or Veterans Day, Christmas holidays or any holiday of the year."},
		{name: "t7", input: "&lt;div>hello<div>", want: "hello"},
		{name: "t8", input: " <div> hello world <div>", want: "hello world"},
	}
	for i := range cases {
		tc := cases[i]
		got := Strip(tc.input)
		assert.Equal(t, tc.want, got, tc.name)
	}
}
// BenchmarkStrip measures Strip on a short product description that starts
// with an HTML comment.
func BenchmarkStrip(b *testing.B) {
	const fixture = `<!-- show up to 2 reviews by default -->
<p>Custom flags for your garden are a great way to show your personality to your friends and neighbors. Design and turn it into an eye-catching flag all year round. This will be a beautiful addition to your yard and garden, also a simple sign to show your patriotism on Memorial Day, 4th of July or Veterans Day, Christmas holidays or any holiday of the year.
</p>`
	for n := 0; n < b.N; n++ {
		Strip(fixture)
	}
}
// TestSpaceless checks which whitespace Spaceless removes and which markup
// it leaves untouched.
func TestSpaceless(t *testing.T) {
	type spacelessCase struct {
		name  string // label reported when the assertion fails
		input string // HTML handed to Spaceless
		want  string // HTML expected back
	}
	cases := []spacelessCase{
		{name: "t0", input: "<div>hello</div>", want: "<div>hello</div>"},
		{name: "t1", input: `
<div>hello</div>
`, want: "<div>hello</div>"},
		{name: "t3", input: "<div style='font-size: 12px;'>hello</div>", want: "<div style='font-size: 12px;'>hello</div>"},
		{name: "t4", input: "<style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>", want: "<style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>"},
		{name: "t4", input: `
<link rel='stylesheet' id='wp-block-library-css' href='https://www.example.com/style.min.css?ver=5.9.1' type='text/css' media='all' />
<style type="text/css">body {font-size: 12px}</style><!-- / See later. --><div style='font-size: 12px;'>hello</div>`, want: `<link rel='stylesheet' id='wp-block-library-css' href='https://www.example.com/style.min.css?ver=5.9.1' type='text/css' media='all' />
<style type="text/css">body {font-size: 12px}</style><!-- / See later. --><div style='font-size: 12px;'>hello</div>`},
		{name: "t7", input: "<div> hello </div> <span></span>", want: "<div> hello </div> <span></span>"},
		{name: "t8", input: `<!-- show up to 2 reviews by default -->
<p>Custom flags for your garden are a great way to show your personality to your friends and neighbors. Design and turn it into an eye-catching flag all year round. This will be a beautiful addition to your yard and garden, also a simple sign to show your patriotism on Memorial Day, 4th of July or Veterans Day, Christmas holidays or any holiday of the year.
</p>`, want: `<!-- show up to 2 reviews by default --> <p>Custom flags for your garden are a great way to show your personality to your friends and neighbors. Design and turn it into an eye-catching flag all year round. This will be a beautiful addition to your yard and garden, also a simple sign to show your patriotism on Memorial Day, 4th of July or Veterans Day, Christmas holidays or any holiday of the year. </p>`},
	}
	for i := range cases {
		tc := cases[i]
		got := Spaceless(tc.input)
		assert.Equal(t, tc.want, got, tc.name)
	}
}
// TestClean runs Clean with single flags, a combination of flags and
// CleanModeAll, and compares the cleaned HTML with the expected markup.
func TestClean(t *testing.T) {
	type cleanCase struct {
		name  string    // label reported when the assertion fails
		input string    // HTML handed to Clean
		mode  CleanMode // flags selecting what to remove
		want  string    // HTML expected back
	}
	cases := []cleanCase{
		{name: "tcss1", input: "<div>hello</div>", mode: CleanModeCSS, want: "<div>hello</div>"},
		{name: "tcss2", input: "<style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>", mode: CleanModeCSS, want: "<div>hello</div>"},
		{name: "tjavascript1", input: `<script src="//www.a.com/1.8.5/blog.js" type='text/javascript'></script><style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>`, mode: CleanModeJavascript, want: "<style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>"},
		{name: "tcomment1", input: `<script src="//www.a.com/1.8.5/blog.js" type='text/javascript'></script><!--comment--><style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>`, mode: CleanModeComment, want: "<script src=\"//www.a.com/1.8.5/blog.js\" type='text/javascript'></script><style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>"},
		{name: "tcss,javascript,comment", input: `<script src="//www.a.com/1.8.5/blog.js" type='text/javascript'></script><!--comment--><style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>`, mode: CleanModeCSS | CleanModeJavascript | CleanModeComment, want: "<div>hello</div>"},
		{name: "tall1", input: `<script>alert("ddd")</script><style>body {font-size: 12px}</style><div style='font-size: 12px;'>hello</div>`, mode: CleanModeAll, want: "<div>hello</div>"},
		{name: "tall2", input: `<!-- show up to 2 reviews by default -->
<p>Product details: +++ Material: 100% Ceramic +++ Size: 11oz or 15oz +++ Dye Sublimation graphics for exceptional prints. +++ Dishwasher and microwave safe. +++ Image is printed on both sides of mug. +++ Printed in the U.S.A. +++ Shipping info: Shipping time is approximately 5-7 business days.
</p>`, mode: CleanModeAll, want: "<p>Product details: +++ Material: 100% Ceramic +++ Size: 11oz or 15oz +++ Dye Sublimation graphics for exceptional prints. +++ Dishwasher and microwave safe. +++ Image is printed on both sides of mug. +++ Printed in the U.S.A. +++ Shipping info: Shipping time is approximately 5-7 business days. </p>"},
		{name: "tall3", input: `<div> 1 2 </div> <div>2</div>`, mode: CleanModeAll, want: `<div> 1 2 </div> <div>2</div>`},
	}
	for i := range cases {
		tc := cases[i]
		got := Clean(tc.input, tc.mode)
		assert.Equal(t, tc.want, got, tc.name)
	}
}
// TestTag checks the markup produced by Tag, including the sorted order of
// attributes and style declarations.
func TestTag(t *testing.T) {
	type tagCase struct {
		name    string            // label reported when the assertion fails
		element string            // element name handed to Tag
		body    string            // inner content
		attrs   map[string]string // element attributes, may be nil
		css     map[string]string // inline style declarations, may be nil
		want    string            // markup expected back
	}
	cases := []tagCase{
		{name: "t0", element: "div", body: "hello", want: "<div>hello</div>"},
		{name: "t1", element: "div", body: "hello", attrs: map[string]string{"id": "name"}, want: `<div id="name">hello</div>`},
		{name: "t1.1", element: "div", body: "hello", attrs: map[string]string{"id": "name", "name": "name"}, want: `<div id="name" name="name">hello</div>`},
		{name: "t2", element: "div", body: "hello", attrs: map[string]string{"id": "name", "data-tag": "123"}, css: map[string]string{"font-size": "1", "font-weight": "bold"}, want: `<div data-tag="123" id="name" style="font-size:1;font-weight:bold;">hello</div>`},
	}
	for i := range cases {
		tc := cases[i]
		got := Tag(tc.element, tc.body, tc.attrs, tc.css)
		assert.Equal(t, tc.want, got, tc.name)
	}
}
// BenchmarkTag measures Tag for an element with one attribute and one style
// declaration. The maps are built inside the loop, so their allocation is
// part of the measurement.
func BenchmarkTag(b *testing.B) {
	for n := 0; n < b.N; n++ {
		attrs := map[string]string{"id": "name"}
		css := map[string]string{"font-size": "1"}
		Tag("div", "hello", attrs, css)
	}
}