@@ -2,9 +2,9 @@ package pagetypeclassifier
 
 import (
 	_ "embed"
-	"sync"
-
 	"fmt"
+	"strings"
+	"sync"
 
 	htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
 	"github.com/microcosm-cc/bluemonday"
@@ -41,30 +41,29 @@ var (
 	sanitizerPolicyOnce sync.Once
 )
 
-// getSanitizerPolicy returns an aggressive HTML sanitizer policy that strips
-// most elements to reduce nesting depth and prevent parser stack overflow.
+// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips
+// almost all elements to minimize nesting depth and prevent parser stack overflow.
 func getSanitizerPolicy() *bluemonday.Policy {
 	sanitizerPolicyOnce.Do(func() {
 		p := bluemonday.NewPolicy()
-		// Allow only basic text elements with minimal nesting
-		// This aggressive policy helps reduce nesting depth significantly
-		p.AllowElements("p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6")
-		p.AllowElements("strong", "em", "b", "i", "u")
-		p.AllowElements("ul", "ol", "li")
-		p.AllowElements("blockquote", "pre", "code")
-		// Allow basic attributes but no style (which can cause nesting issues)
-		p.AllowStandardAttributes()
+		// Ultra-aggressive policy: allow only the most basic text elements
+		// to minimize nesting and reduce parser stack depth.
+		p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6")
+		p.AllowElements("strong", "em", "b", "i")
+		// div, span, ul, ol, and li are dropped because they can create deep nesting;
+		// no attributes are allowed, to avoid style-based nesting issues.
 		sanitizerPolicy = p
 	})
 	return sanitizerPolicy
 }
 
-// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
+// htmlToText safely converts HTML to text with multiple fallback strategies.
 // The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
 // Strategy:
-// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
-// 2. Convert sanitized HTML to markdown
-// 3. If conversion panics, recover and return empty string with error
+// 1. Length-limit the input HTML to avoid processing massive documents
+// 2. Sanitize the HTML aggressively with bluemonday to reduce nesting
+// 3. Convert the sanitized HTML to markdown with panic recovery
+// 4. If conversion fails, fall back to plain text extraction
 func htmlToText(html string) (text string, err error) {
 	defer func() {
 		if r := recover(); r != nil {
@@ -73,19 +72,85 @@ func htmlToText(html string) (text string, err error) {
 	}
 	}()
 
-	// First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
+	// Limit input size to avoid processing extremely large HTML documents
+	const maxHTMLSize = 1024 * 1024 // 1MB limit
+	if len(html) > maxHTMLSize {
+		html = html[:maxHTMLSize]
+	}
+
+	// First, sanitize the HTML with the ultra-aggressive bluemonday policy
 	sanitizedHTML := getSanitizerPolicy().Sanitize(html)
 
-	// If sanitization failed or produced empty result, return empty
+	// If sanitization produced an empty result, try the plain text fallback
 	if sanitizedHTML == "" {
-		return "", nil
+		return extractPlainText(html), nil
 	}
 
 	// Convert sanitized HTML to markdown
 	text, err = htmltomarkdown.ConvertString(sanitizedHTML)
-	if err != nil || text == "" {
-		return "", err
+	if err != nil {
+		// If markdown conversion fails, fall back to plain text extraction
+		return extractPlainText(sanitizedHTML), nil
 	}
+
+	if text == "" {
+		// If the result is empty, try the plain text fallback
+		return extractPlainText(sanitizedHTML), nil
+	}
+
+	return text, nil
+}
 
-	return
+// extractPlainText is a simple fallback that extracts text content without parsing HTML.
+// It is used when the HTML parser fails due to document complexity or nesting depth.
+func extractPlainText(html string) string {
+	// Simple string-scanning extraction as a fallback:
+	// strip script and style blocks first, then remove remaining tags.
+	text := html
+
+	// Remove <script> tags and their content
+	for {
+		start := strings.Index(text, "<script")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(text[start:], "</script>")
+		if end == -1 {
+			text = text[:start]
+			break
+		}
+		text = text[:start] + text[start+end+len("</script>"):]
+	}
+
+	// Remove <style> tags and their content
+	for {
+		start := strings.Index(text, "<style")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(text[start:], "</style>")
+		if end == -1 {
+			text = text[:start]
+			break
+		}
+		text = text[:start] + text[start+end+len("</style>"):]
+	}
+
+	// Simple HTML tag removal (not perfect, but safe)
+	var sb strings.Builder
+	inTag := false
+	for _, char := range text {
+		if char == '<' {
+			inTag = true
+		} else if char == '>' {
+			inTag = false
+			sb.WriteByte(' ') // Replace tags with spaces
+		} else if !inTag {
+			sb.WriteRune(char)
+		}
+	}
+
+	// Collapse whitespace runs into single spaces
+	words := strings.Fields(sb.String())
+	return strings.Join(words, " ")
 }
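
As a quick sanity check, here is a minimal test sketch of the new fallback path. It is not part of this commit; the test name and inputs are illustrative, and it assumes the functions above live in package pagetypeclassifier and that bluemonday keeps the text content of stripped elements (its default behavior).

package pagetypeclassifier

import (
	"strings"
	"testing"
)

// TestHtmlToTextFallback (hypothetical) exercises the new conversion paths:
// deeply nested markup that the strict parser cannot handle, and the plain
// text fallback used when sanitization or markdown conversion yields nothing.
func TestHtmlToTextFallback(t *testing.T) {
	// 600 nested divs are more nesting than the hardcoded 512 limit mentioned above,
	// but the sanitizer strips div entirely, so markdown conversion still succeeds.
	deep := strings.Repeat("<div>", 600) + "<p>hello world</p>" + strings.Repeat("</div>", 600)
	text, err := htmlToText(deep)
	if err != nil {
		t.Fatalf("htmlToText returned error: %v", err)
	}
	if !strings.Contains(text, "hello world") {
		t.Fatalf("expected converted text to contain %q, got %q", "hello world", text)
	}

	// extractPlainText drops script/style content and replaces remaining tags with spaces.
	plain := extractPlainText("<script>var x = 1;</script><p>kept</p><style>p{}</style>")
	if plain != "kept" {
		t.Fatalf("expected %q, got %q", "kept", plain)
	}
}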