@@ -2,9 +2,9 @@ package pagetypeclassifier
 
 import (
 	_ "embed"
-	"sync"
-
 	"fmt"
+	"strings"
+	"sync"
 
 	htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
 	"github.com/microcosm-cc/bluemonday"
@@ -41,30 +41,29 @@ var (
 	sanitizerPolicyOnce sync.Once
 )
 
-// getSanitizerPolicy returns an aggressive HTML sanitizer policy that strips
-// most elements to reduce nesting depth and prevent parser stack overflow.
+// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips
+// almost all elements to minimize nesting depth and prevent parser stack overflow.
 func getSanitizerPolicy() *bluemonday.Policy {
 	sanitizerPolicyOnce.Do(func() {
 		p := bluemonday.NewPolicy()
-		// Allow only basic text elements with minimal nesting
-		// This aggressive policy helps reduce nesting depth significantly
-		p.AllowElements("p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6")
-		p.AllowElements("strong", "em", "b", "i", "u")
-		p.AllowElements("ul", "ol", "li")
-		p.AllowElements("blockquote", "pre", "code")
-		// Allow basic attributes but no style (which can cause nesting issues)
-		p.AllowStandardAttributes()
+		// Ultra-aggressive policy: allow only the most basic text elements
+		// to minimize nesting and reduce parser stack depth.
+		p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6")
+		p.AllowElements("strong", "em", "b", "i")
+		// div, span, ul, ol, and li are dropped because they can create deep nesting;
+		// no attributes are allowed, to avoid style-based nesting issues.
 		sanitizerPolicy = p
 	})
 	return sanitizerPolicy
 }
 
-// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
+// htmlToText safely converts HTML to text with multiple fallback strategies.
 // The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
 // Strategy:
-// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
-// 2. Convert sanitized HTML to markdown
-// 3. If conversion panics, recover and return empty string with error
+// 1. Length-limit the input HTML to avoid processing massive documents
+// 2. Sanitize the HTML aggressively with bluemonday to reduce nesting
+// 3. Convert the sanitized HTML to markdown with panic recovery
+// 4. If conversion fails, fall back to plain text extraction
 func htmlToText(html string) (text string, err error) {
 	defer func() {
 		if r := recover(); r != nil {
@@ -73,19 +72,85 @@ func htmlToText(html string) (text string, err error) {
 	}
 	}()
 
-	// First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
+	// Limit input size to avoid processing extremely large HTML documents
+	const maxHTMLSize = 1024 * 1024 // 1MB limit
+	if len(html) > maxHTMLSize {
+		html = html[:maxHTMLSize]
+	}
+
+	// First, sanitize the HTML with the ultra-aggressive bluemonday policy
 	sanitizedHTML := getSanitizerPolicy().Sanitize(html)
 
-	// If sanitization failed or produced empty result, return empty
+	// If sanitization produced an empty result, try the plain text fallback
 	if sanitizedHTML == "" {
-		return "", nil
+		return extractPlainText(html), nil
 	}
 
 	// Convert sanitized HTML to markdown
 	text, err = htmltomarkdown.ConvertString(sanitizedHTML)
-	if err != nil || text == "" {
-		return "", err
+	if err != nil {
+		// If markdown conversion fails, fall back to plain text extraction
+		return extractPlainText(sanitizedHTML), nil
 	}
+
+	if text == "" {
+		// If the result is empty, try the plain text fallback
+		return extractPlainText(sanitizedHTML), nil
+	}
+
+	return text, nil
+}
 
-	return
+// extractPlainText is a simple fallback that extracts text content without parsing HTML.
+// It is used when the HTML parser fails due to document complexity or nesting depth.
+func extractPlainText(html string) string {
+	// Simple string-scanning extraction as a fallback:
+	// strip script and style blocks first, then remove remaining tags.
+	text := html
+
+	// Remove <script> tags and their content
+	for {
+		start := strings.Index(text, "<script")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(text[start:], "</script>")
+		if end == -1 {
+			text = text[:start]
+			break
+		}
+		text = text[:start] + text[start+end+len("</script>"):]
+	}
+
+	// Remove <style> tags and their content
+	for {
+		start := strings.Index(text, "<style")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(text[start:], "</style>")
+		if end == -1 {
+			text = text[:start]
+			break
+		}
+		text = text[:start] + text[start+end+len("</style>"):]
+	}
+
+	// Simple HTML tag removal (not perfect, but safe)
+	var sb strings.Builder
+	inTag := false
+	for _, char := range text {
+		if char == '<' {
+			inTag = true
+		} else if char == '>' {
+			inTag = false
+			sb.WriteByte(' ') // Replace tags with spaces
+		} else if !inTag {
+			sb.WriteRune(char)
+		}
+	}
+
+	// Collapse whitespace runs into single spaces
+	words := strings.Fields(sb.String())
+	return strings.Join(words, " ")
 }
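
As a quick sanity check, here is a minimal test sketch of the new fallback path. It is not part of this commit; the test name and inputs are illustrative, and it assumes the functions above live in package pagetypeclassifier and that bluemonday keeps the text content of stripped elements (its default behavior).

package pagetypeclassifier

import (
	"strings"
	"testing"
)

// TestHtmlToTextFallback (hypothetical) exercises the new conversion paths:
// deeply nested markup that the strict parser cannot handle, and the plain
// text fallback used when sanitization or markdown conversion yields nothing.
func TestHtmlToTextFallback(t *testing.T) {
	// 600 nested divs are more nesting than the hardcoded 512 limit mentioned above,
	// but the sanitizer strips div entirely, so markdown conversion still succeeds.
	deep := strings.Repeat("<div>", 600) + "<p>hello world</p>" + strings.Repeat("</div>", 600)
	text, err := htmlToText(deep)
	if err != nil {
		t.Fatalf("htmlToText returned error: %v", err)
	}
	if !strings.Contains(text, "hello world") {
		t.Fatalf("expected converted text to contain %q, got %q", "hello world", text)
	}

	// extractPlainText drops script/style content and replaces remaining tags with spaces.
	plain := extractPlainText("<script>var x = 1;</script><p>kept</p><style>p{}</style>")
	if plain != "kept" {
		t.Fatalf("expected %q, got %q", "kept", plain)
	}
}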