Skip to content

Commit 0309110

Browse files
committed
adding panic guard + tests
1 parent 0658afd commit 0309110

File tree

2 files changed

+100
-2
lines changed

2 files changed

+100
-2
lines changed

common/pagetypeclassifier/pagetypeclassifier.go

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ package pagetypeclassifier
22

33
import (
44
_ "embed"
5+
"sync"
56

7+
"github.com/microcosm-cc/bluemonday"
68
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
79
"github.com/projectdiscovery/utils/ml/naive_bayes"
810
)
@@ -30,7 +32,57 @@ func (n *PageTypeClassifier) Classify(html string) string {
3032
return n.classifier.Classify(text)
3133
}
3234

35+
var (
36+
// sanitizerPolicy is an aggressive bluemonday policy that strips most HTML
37+
// to reduce nesting depth and prevent parser stack overflow
38+
sanitizerPolicy *bluemonday.Policy
39+
sanitizerPolicyOnce sync.Once
40+
)
41+
42+
// getSanitizerPolicy returns an aggressive HTML sanitizer policy that strips
43+
// most elements to reduce nesting depth and prevent parser stack overflow.
44+
func getSanitizerPolicy() *bluemonday.Policy {
45+
sanitizerPolicyOnce.Do(func() {
46+
p := bluemonday.NewPolicy()
47+
// Allow only basic text elements with minimal nesting
48+
// This aggressive policy helps reduce nesting depth significantly
49+
p.AllowElements("p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6")
50+
p.AllowElements("strong", "em", "b", "i", "u")
51+
p.AllowElements("ul", "ol", "li")
52+
p.AllowElements("blockquote", "pre", "code")
53+
// Allow basic attributes but no style (which can cause nesting issues)
54+
p.AllowStandardAttributes()
55+
sanitizerPolicy = p
56+
})
57+
return sanitizerPolicy
58+
}
59+
3360
// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
61+
// The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
62+
// Strategy:
63+
// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
64+
// 2. Convert sanitized HTML to markdown
65+
// 3. If conversion panics, recover and return empty string
3466
func htmlToText(html string) (string, error) {
35-
return htmltomarkdown.ConvertString(html)
67+
defer func() {
68+
if r := recover(); r != nil {
69+
// If anything panics, we'll return empty string
70+
}
71+
}()
72+
73+
// First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
74+
sanitizedHTML := getSanitizerPolicy().Sanitize(html)
75+
76+
// If sanitization failed or produced empty result, return empty
77+
if sanitizedHTML == "" {
78+
return "", nil
79+
}
80+
81+
// Convert sanitized HTML to markdown
82+
result, err := htmltomarkdown.ConvertString(sanitizedHTML)
83+
if err != nil || result == "" {
84+
return "", nil
85+
}
86+
87+
return result, nil
3688
}

common/pagetypeclassifier/pagetypeclassifier_test.go

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
)
88

99
func TestPageTypeClassifier(t *testing.T) {
10-
1110
t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
1211
epc, err := New()
1312
require.NoError(t, err)
@@ -56,4 +55,51 @@ func TestPageTypeClassifier(t *testing.T) {
5655
</html>
5756
`))
5857
})
58+
59+
t.Run("test panic recovery with deeply nested HTML", func(t *testing.T) {
60+
epc, err := New()
61+
require.NoError(t, err)
62+
require.NotNil(t, epc)
63+
64+
// Generate deeply nested HTML that exceeds the 512 node stack limit
65+
// This should trigger a panic in the HTML parser, which we recover from
66+
deeplyNestedHTML := "<div>"
67+
for i := 0; i < 600; i++ {
68+
deeplyNestedHTML += "<div><span>"
69+
}
70+
deeplyNestedHTML += "Some text content"
71+
for i := 0; i < 600; i++ {
72+
deeplyNestedHTML += "</span></div>"
73+
}
74+
deeplyNestedHTML += "</div>"
75+
76+
// Should not panic and should return "other" when htmlToText returns empty string
77+
result := epc.Classify(deeplyNestedHTML)
78+
require.Equal(t, "other", result)
79+
})
80+
81+
t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
82+
// Generate deeply nested HTML that exceeds the 512 node stack limit
83+
deeplyNestedHTML := "<div>"
84+
for i := 0; i < 600; i++ {
85+
deeplyNestedHTML += "<div><span>"
86+
}
87+
deeplyNestedHTML += "Some text content"
88+
for i := 0; i < 600; i++ {
89+
deeplyNestedHTML += "</span></div>"
90+
}
91+
deeplyNestedHTML += "</div>"
92+
93+
// Must not let a panic escape; htmlToText should return an empty string and a nil error
94+
result, err := htmlToText(deeplyNestedHTML)
95+
require.NoError(t, err)
96+
require.Equal(t, "", result)
97+
})
98+
99+
t.Run("test htmlToText with normal HTML", func(t *testing.T) {
100+
normalHTML := `<html><body><h1>Title</h1><p>Some content here</p></body></html>`
101+
result, err := htmlToText(normalHTML)
102+
require.NoError(t, err)
103+
require.NotEmpty(t, result)
104+
})
59105
}

0 commit comments

Comments
 (0)