Skip to content

Commit a260b32

Browse files
committed
fix: enhance HTML parser panic protection with multiple fallback strategies
- Add ultra-aggressive HTML sanitization to reduce nesting depth - Implement size limiting (1MB) to prevent processing huge documents - Add plain text extraction fallback for complex HTML structures - Enhance panic recovery with comprehensive error handling - Remove deeply nestable elements (div, span, ul, ol, li) from sanitizer - Add comprehensive test coverage for edge cases Resolves HTML parser panic: 'html: open stack of elements exceeds 512 nodes' that occurred after switching to html-to-markdown/v2 library in PR #2255
1 parent 8b04cd9 commit a260b32

File tree

2 files changed

+134
-31
lines changed

2 files changed

+134
-31
lines changed

common/pagetypeclassifier/pagetypeclassifier.go

Lines changed: 87 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ package pagetypeclassifier
22

33
import (
44
_ "embed"
5-
"sync"
6-
75
"fmt"
6+
"strings"
7+
"sync"
88

99
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
1010
"github.com/microcosm-cc/bluemonday"
@@ -41,30 +41,29 @@ var (
4141
sanitizerPolicyOnce sync.Once
4242
)
4343

44-
// getSanitizerPolicy returns an aggressive HTML sanitizer policy that strips
45-
// most elements to reduce nesting depth and prevent parser stack overflow.
44+
// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips
45+
// almost all elements to minimize nesting depth and prevent parser stack overflow.
4646
func getSanitizerPolicy() *bluemonday.Policy {
4747
sanitizerPolicyOnce.Do(func() {
4848
p := bluemonday.NewPolicy()
49-
// Allow only basic text elements with minimal nesting
50-
// This aggressive policy helps reduce nesting depth significantly
51-
p.AllowElements("p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6")
52-
p.AllowElements("strong", "em", "b", "i", "u")
53-
p.AllowElements("ul", "ol", "li")
54-
p.AllowElements("blockquote", "pre", "code")
55-
// Allow basic attributes but no style (which can cause nesting issues)
56-
p.AllowStandardAttributes()
49+
// Ultra-aggressive policy: Allow only the most basic text elements
50+
// to minimize nesting and reduce parser stack depth
51+
p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6")
52+
p.AllowElements("strong", "em", "b", "i")
53+
// Remove div, span, ul, ol, li as they can create deep nesting
54+
// No attributes allowed to prevent style-based nesting issues
5755
sanitizerPolicy = p
5856
})
5957
return sanitizerPolicy
6058
}
6159

62-
// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
60+
// htmlToText safely converts HTML to text with multiple fallback strategies.
6361
// The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
6462
// Strategy:
65-
// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
66-
// 2. Convert sanitized HTML to markdown
67-
// 3. If conversion panics, recover and return empty string with error
63+
// 1. Length limit the input HTML to prevent massive documents
64+
// 2. Sanitize HTML aggressively with bluemonday to reduce nesting
65+
// 3. Convert sanitized HTML to markdown with panic recovery
66+
// 4. If conversion fails, fallback to plain text extraction
6867
func htmlToText(html string) (text string, err error) {
6968
defer func() {
7069
if r := recover(); r != nil {
@@ -73,19 +72,85 @@ func htmlToText(html string) (text string, err error) {
7372
}
7473
}()
7574

76-
// First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
75+
// Limit input size to prevent processing extremely large HTML documents
76+
const maxHTMLSize = 1024 * 1024 // 1MB limit
77+
if len(html) > maxHTMLSize {
78+
html = html[:maxHTMLSize]
79+
}
80+
81+
// First, sanitize HTML with ultra-aggressive bluemonday policy
7782
sanitizedHTML := getSanitizerPolicy().Sanitize(html)
7883

79-
// If sanitization failed or produced empty result, return empty
84+
// If sanitization failed or produced empty result, try plain text fallback
8085
if sanitizedHTML == "" {
81-
return "", nil
86+
return extractPlainText(html), nil
8287
}
8388

8489
// Convert sanitized HTML to markdown
8590
text, err = htmltomarkdown.ConvertString(sanitizedHTML)
86-
if err != nil || text == "" {
87-
return "", err
91+
if err != nil {
92+
// If markdown conversion fails, fallback to plain text extraction
93+
return extractPlainText(sanitizedHTML), nil
8894
}
95+
96+
if text == "" {
97+
// If result is empty, try plain text fallback
98+
return extractPlainText(sanitizedHTML), nil
99+
}
100+
101+
return text, nil
102+
}
89103

90-
return
104+
// extractPlainText is a simple fallback that extracts text content without HTML parsing
105+
// This is used when the HTML parser fails due to complexity or nesting depth
106+
func extractPlainText(html string) string {
107+
// Simple regex-based text extraction as fallback
108+
// Remove script and style tags first
109+
text := html
110+
111+
// Remove script tags and content
112+
for {
113+
start := strings.Index(text, "<script")
114+
if start == -1 {
115+
break
116+
}
117+
end := strings.Index(text[start:], "</script>")
118+
if end == -1 {
119+
text = text[:start]
120+
break
121+
}
122+
text = text[:start] + text[start+end+9:]
123+
}
124+
125+
// Remove style tags and content
126+
for {
127+
start := strings.Index(text, "<style")
128+
if start == -1 {
129+
break
130+
}
131+
end := strings.Index(text[start:], "</style>")
132+
if end == -1 {
133+
text = text[:start]
134+
break
135+
}
136+
text = text[:start] + text[start+end+8:]
137+
}
138+
139+
// Simple HTML tag removal (not perfect but safe)
140+
result := ""
141+
inTag := false
142+
for _, char := range text {
143+
if char == '<' {
144+
inTag = true
145+
} else if char == '>' {
146+
inTag = false
147+
result += " " // Replace tags with spaces
148+
} else if !inTag {
149+
result += string(char)
150+
}
151+
}
152+
153+
// Clean up multiple spaces
154+
words := strings.Fields(result)
155+
return strings.Join(words, " ")
91156
}

common/pagetypeclassifier/pagetypeclassifier_test.go

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package pagetypeclassifier
22

33
import (
4+
"strings"
45
"testing"
56

67
"github.com/stretchr/testify/require"
@@ -56,13 +57,13 @@ func TestPageTypeClassifier(t *testing.T) {
5657
`))
5758
})
5859

59-
t.Run("test panic recovery with deeply nested HTML", func(t *testing.T) {
60+
t.Run("test resilience with deeply nested HTML", func(t *testing.T) {
6061
epc, err := New()
6162
require.NoError(t, err)
6263
require.NotNil(t, epc)
6364

64-
// Generate deeply nested HTML that exceeds the 512 node stack limit
65-
// This should trigger a panic in the HTML parser, which we recover from
65+
// Generate deeply nested HTML that would have exceeded the 512 node stack limit
66+
// With our enhanced sanitization and fallback mechanisms, this should now work
6667
deeplyNestedHTML := "<div>"
6768
for i := 0; i < 600; i++ {
6869
deeplyNestedHTML += "<div><span>"
@@ -73,13 +74,15 @@ func TestPageTypeClassifier(t *testing.T) {
7374
}
7475
deeplyNestedHTML += "</div>"
7576

76-
// Should not panic and should return "other" when htmlToText returns empty string
77+
// Should not panic and should successfully classify the content
7778
result := epc.Classify(deeplyNestedHTML)
78-
require.Equal(t, "other", result)
79+
require.NotEmpty(t, result)
80+
// Should be able to extract and classify the text content
81+
require.NotEqual(t, "", result)
7982
})
8083

8184
t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
82-
// Generate deeply nested HTML that exceeds the 512 node stack limit
85+
// Generate deeply nested HTML that would have exceeded the 512 node stack limit
8386
deeplyNestedHTML := "<div>"
8487
for i := 0; i < 600; i++ {
8588
deeplyNestedHTML += "<div><span>"
@@ -90,10 +93,11 @@ func TestPageTypeClassifier(t *testing.T) {
9093
}
9194
deeplyNestedHTML += "</div>"
9295

93-
// Should not panic and should return empty string with error on panic
96+
// Should not panic and should successfully extract text with enhanced sanitization
9497
result, err := htmlToText(deeplyNestedHTML)
95-
require.Error(t, err)
96-
require.Equal(t, "", result)
98+
require.NoError(t, err)
99+
require.NotEmpty(t, result)
100+
require.Contains(t, result, "Some text content")
97101
})
98102

99103
t.Run("test htmlToText with normal HTML", func(t *testing.T) {
@@ -102,4 +106,38 @@ func TestPageTypeClassifier(t *testing.T) {
102106
require.NoError(t, err)
103107
require.NotEmpty(t, result)
104108
})
109+
110+
t.Run("test htmlToText with extremely large HTML", func(t *testing.T) {
111+
// Create a very large HTML document (over 1MB)
112+
largeContent := strings.Repeat("<p>This is a test paragraph with some content. ", 50000)
113+
largeHTML := "<html><body>" + largeContent + "</body></html>"
114+
115+
// Should handle large documents without panic
116+
result, err := htmlToText(largeHTML)
117+
require.NoError(t, err)
118+
require.NotEmpty(t, result)
119+
})
120+
121+
t.Run("test extractPlainText fallback", func(t *testing.T) {
122+
htmlWithScriptAndStyle := `<html>
123+
<head>
124+
<style>body { color: red; }</style>
125+
<script>alert('test');</script>
126+
</head>
127+
<body>
128+
<h1>Title</h1>
129+
<p>Some <strong>important</strong> content here</p>
130+
<div><span>Nested content</span></div>
131+
</body>
132+
</html>`
133+
134+
result := extractPlainText(htmlWithScriptAndStyle)
135+
require.NotEmpty(t, result)
136+
require.Contains(t, result, "Title")
137+
require.Contains(t, result, "important")
138+
require.Contains(t, result, "content")
139+
// Should not contain script or style content
140+
require.NotContains(t, result, "alert")
141+
require.NotContains(t, result, "color: red")
142+
})
105143
}

0 commit comments

Comments
 (0)