@@ -2,7 +2,9 @@ package pagetypeclassifier
22
33import (
44 _ "embed"
5+ "sync"
56
7+ "github.com/microcosm-cc/bluemonday"
68 htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
79 "github.com/projectdiscovery/utils/ml/naive_bayes"
810)
@@ -30,7 +32,57 @@ func (n *PageTypeClassifier) Classify(html string) string {
3032 return n .classifier .Classify (text )
3133}
3234
35+ var (
36+ // sanitizerPolicy is an aggressive bluemonday policy that strips most HTML
37+ // to reduce nesting depth and prevent parser stack overflow
38+ sanitizerPolicy * bluemonday.Policy
39+ sanitizerPolicyOnce sync.Once
40+ )
41+
42+ // getSanitizerPolicy returns an aggressive HTML sanitizer policy that strips
43+ // most elements to reduce nesting depth and prevent parser stack overflow.
44+ func getSanitizerPolicy () * bluemonday.Policy {
45+ sanitizerPolicyOnce .Do (func () {
46+ p := bluemonday .NewPolicy ()
47+ // Allow only basic text elements with minimal nesting
48+ // This aggressive policy helps reduce nesting depth significantly
49+ p .AllowElements ("p" , "br" , "div" , "span" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" )
50+ p .AllowElements ("strong" , "em" , "b" , "i" , "u" )
51+ p .AllowElements ("ul" , "ol" , "li" )
52+ p .AllowElements ("blockquote" , "pre" , "code" )
53+ // Allow basic attributes but no style (which can cause nesting issues)
54+ p .AllowStandardAttributes ()
55+ sanitizerPolicy = p
56+ })
57+ return sanitizerPolicy
58+ }
59+
3360// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
61+ // The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
62+ // Strategy:
63+ // 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
64+ // 2. Convert sanitized HTML to markdown
65+ // 3. If conversion panics, recover and return empty string
3466func htmlToText (html string ) (string , error ) {
35- return htmltomarkdown .ConvertString (html )
67+ defer func () {
68+ if r := recover (); r != nil {
69+ // If anything panics, we'll return empty string
70+ }
71+ }()
72+
73+ // First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
74+ sanitizedHTML := getSanitizerPolicy ().Sanitize (html )
75+
76+ // If sanitization failed or produced empty result, return empty
77+ if sanitizedHTML == "" {
78+ return "" , nil
79+ }
80+
81+ // Convert sanitized HTML to markdown
82+ result , err := htmltomarkdown .ConvertString (sanitizedHTML )
83+ if err != nil || result == "" {
84+ return "" , nil
85+ }
86+
87+ return result , nil
3688}
0 commit comments