File tree Expand file tree Collapse file tree 1 file changed +8
-6
lines changed
common/pagetypeclassifier Expand file tree Collapse file tree 1 file changed +8
-6
lines changed Original file line number Diff line number Diff line change 77 "github.com/microcosm-cc/bluemonday"
88 htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
99 "github.com/projectdiscovery/utils/ml/naive_bayes"
10+ "fmt"
1011)
1112
1213//go:embed clf.gob
@@ -63,10 +64,11 @@ func getSanitizerPolicy() *bluemonday.Policy {
6364// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
6465// 2. Convert sanitized HTML to markdown
6566// 3. If conversion panics, recover and return an empty string together with an error describing the panic
66- func htmlToText (html string ) (string , error ) {
67+ func htmlToText (html string ) (text string , err error ) {
6768 defer func () {
6869 if r := recover (); r != nil {
69- // If anything panics, we'll return empty string
70+ err = fmt .Errorf ("html parser panic: %v" , r )
71+ text = ""
7072 }
7173 }()
7274
@@ -79,10 +81,10 @@ func htmlToText(html string) (string, error) {
7981 }
8082
8183 // Convert sanitized HTML to markdown
82- result , err : = htmltomarkdown .ConvertString (sanitizedHTML )
83- if err != nil || result == "" {
84- return "" , nil
84+ text , err = htmltomarkdown .ConvertString (sanitizedHTML )
85+ if err != nil || text == "" {
86+ return "" , err
8587 }
8688
87- return result , nil
89+ return
8890}
You can’t perform that action at this time.
0 commit comments