golines -w src

This commit is contained in:
nkanaev
2026-04-25 22:45:33 +01:00
parent f01c26b2c2
commit f1bdbbc0af
14 changed files with 206 additions and 49 deletions

View File

@@ -27,10 +27,16 @@ var (
blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
unlikelyCandidatesRegexp = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
unlikelyCandidatesRegexp = regexp.MustCompile(
`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`,
)
negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
negativeRegexp = regexp.MustCompile(
`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`,
)
positiveRegexp = regexp.MustCompile(
`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`,
)
)
type nodeScores map[*html.Node]float32

View File

@@ -146,7 +146,10 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
}
attrNames = append(attrNames, attribute.Key)
htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
htmlAttrs = append(
htmlAttrs,
fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)),
)
}
extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
@@ -161,11 +164,25 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
func getExtraAttributes(tagName string) ([]string, []string) {
switch tagName {
case "a":
return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
return []string{
"rel",
"target",
"referrerpolicy",
}, []string{
`rel="noopener noreferrer"`,
`target="_blank"`,
`referrerpolicy="no-referrer"`,
}
case "video", "audio":
return []string{"controls"}, []string{"controls"}
case "iframe":
return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
return []string{
"sandbox",
"loading",
}, []string{
`sandbox="allow-scripts allow-same-origin allow-popups"`,
`loading="lazy"`,
}
case "img":
return []string{"loading"}, []string{`loading="lazy"`, `referrerpolicy="no-referrer"`}
default: