package htmlx import ( "regexp" "sort" "strings" "unicode/utf8" "git.cloudyne.io/go/hiscaler-gox/stringx" ) var ( // Strip regexp rxStrip = regexp.MustCompile(`(?s)||||| style=['"]+(.*)['"]+`) // Spaceless regexp rxSpaceless = regexp.MustCompile(`/>\s+|| style=['"]+(.*)['"]+`) rxCleanJavascript = regexp.MustCompile(`(?s)`) rxCleanComment = regexp.MustCompile(`(?s)`) rxCleanMeta = regexp.MustCompile(`(?s)`) ) type CleanMode uint32 const ( CleanModeCSS CleanMode = 1 << (10 - iota) // 包括元素内嵌样式 CleanModeJavascript CleanModeComment CleanModeMeta CleanModeSpace CleanModeAll = CleanModeCSS | CleanModeJavascript | CleanModeComment | CleanModeMeta | CleanModeSpace ) // Strip Clean html tags // https://stackoverflow.com/questions/55036156/how-to-replace-all-html-tag-with-empty-string-in-golang func Strip(html string) string { html = strings.TrimSpace(html) if html != "" { html = rxStrip.ReplaceAllString(html, "") } if html == "" { return "" } const ( htmlTagStart = 60 // Unicode `<` htmlTagEnd = 62 // Unicode `>` ) // Setup a string builder and allocate enough memory for the new string. var builder strings.Builder builder.Grow(len(html) + utf8.UTFMax) in := false // True if we are inside an HTML tag. start := 0 // The index of the previous start tag character `<` end := 0 // The index of the previous end tag character `>` for i, c := range html { // If this is the last character and we are not in an HTML tag, save it. if (i+1) == len(html) && end >= start && c != htmlTagStart && c != htmlTagEnd { builder.WriteString(html[end:]) } // Keep going if the character is not `<` or `>` if c != htmlTagStart && c != htmlTagEnd { continue } if c == htmlTagStart { // Only update the start if we are not in a tag. // This make sure we strip out `<
` not just `
` if !in { start = i } in = true // Write the valid string between the close and start of the two tags. builder.WriteString(html[end:start]) continue } // else c == htmlTagEnd in = false end = i + 1 } s := builder.String() if s != "" { s = strings.TrimSpace(Spaceless(s)) } return s } // Spaceless 移除多余的空格 func Spaceless(html string) string { html = stringx.RemoveExtraSpace(html) if html == "" { return "" } return rxSpaceless.ReplaceAllString(html, "><") } func Clean(html string, cleanMode CleanMode) string { if html == "" { return html } const n = 5 modes := [n]bool{} // css, javascript, comment, meta, space, all for i := 0; i < n; i++ { if cleanMode&(1< 1 { sort.Strings(keys) } return keys } for _, k := range fnSortedKeys(attributes) { sb.WriteString(" ") sb.WriteString(k) sb.WriteString(`="`) sb.WriteString(attributes[k]) sb.WriteString(`"`) } keys := fnSortedKeys(styles) if len(keys) > 0 { sb.WriteString(` style="`) for _, k := range keys { sb.WriteString(k) sb.WriteString(":") sb.WriteString(styles[k]) sb.WriteString(`;`) } sb.WriteString(`"`) } sb.WriteString(">") sb.WriteString(content) sb.WriteString("") return sb.String() }