diff --git a/go.mod b/go.mod
index c55fecf..cbc46b2 100644
--- a/go.mod
+++ b/go.mod
@@ -3,6 +3,7 @@ module github.com/nkanaev/yarr
go 1.16
require (
+ github.com/PuerkitoBio/goquery v1.5.1
github.com/mattn/go-sqlite3 v1.14.0
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e
golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13
diff --git a/go.sum b/go.sum
index 054e8f1..02859e5 100644
--- a/go.sum
+++ b/go.sum
@@ -1,4 +1,6 @@
+github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
+github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
diff --git a/src/scraper/readability.go b/src/scraper/readability.go
new file mode 100644
index 0000000..581c8af
--- /dev/null
+++ b/src/scraper/readability.go
@@ -0,0 +1,305 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ //"log"
+ "math"
+ "regexp"
+ "strings"
+
+ "github.com/PuerkitoBio/goquery"
+ "golang.org/x/net/html"
+)
+
+const (
+ defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
+)
+
+var (
+ divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
+ sentenceRegexp = regexp.MustCompile(`\.( |$)`)
+
+ blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
+ okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
+ unlikelyCandidatesRegexp = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
+
+ negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
+ positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
+)
+
+type candidate struct {
+ selection *goquery.Selection
+ score float32
+}
+
+func (c *candidate) Node() *html.Node {
+ return c.selection.Get(0)
+}
+
+func (c *candidate) String() string {
+ id, _ := c.selection.Attr("id")
+ class, _ := c.selection.Attr("class")
+
+ if id != "" && class != "" {
+ return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
+ } else if id != "" {
+ return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
+ } else if class != "" {
+ return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
+ }
+
+ return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
+}
+
+type candidateList map[*html.Node]*candidate
+
+func (c candidateList) String() string {
+ var output []string
+ for _, candidate := range c {
+ output = append(output, candidate.String())
+ }
+
+ return strings.Join(output, ", ")
+}
+
+// ExtractContent returns relevant content.
+func ExtractContent(page io.Reader) (string, error) {
+ document, err := goquery.NewDocumentFromReader(page)
+ if err != nil {
+ return "", err
+ }
+
+ document.Find("script,style").Each(func(i int, s *goquery.Selection) {
+ removeNodes(s)
+ })
+
+ transformMisusedDivsIntoParagraphs(document)
+ removeUnlikelyCandidates(document)
+
+ candidates := getCandidates(document)
+ //log.Printf("[Readability] Candidates: %v", candidates)
+
+ topCandidate := getTopCandidate(document, candidates)
+ //log.Printf("[Readability] TopCandidate: %v", topCandidate)
+
+ output := getArticle(topCandidate, candidates)
+ return output, nil
+}
+
+// Now that we have the top candidate, look through its siblings for content that might also be related.
+// Things like preambles, content split by ads that we removed, etc.
+func getArticle(topCandidate *candidate, candidates candidateList) string {
+ output := bytes.NewBufferString("
")
+ siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+
+ topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
+ append := false
+ node := s.Get(0)
+
+ if node == topCandidate.Node() {
+ append = true
+ } else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
+ append = true
+ }
+
+ if s.Is("p") {
+ linkDensity := getLinkDensity(s)
+ content := s.Text()
+ contentLength := len(content)
+
+ if contentLength >= 80 && linkDensity < .25 {
+ append = true
+ } else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
+ append = true
+ }
+ }
+
+ if append {
+ tag := "div"
+ if s.Is("p") {
+ tag = node.Data
+ }
+
+ html, _ := s.Html()
+ fmt.Fprintf(output, "<%s>%s%s>", tag, html, tag)
+ }
+ })
+
+ output.Write([]byte("
"))
+ return output.String()
+}
+
+func removeUnlikelyCandidates(document *goquery.Document) {
+ document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
+ class, _ := s.Attr("class")
+ id, _ := s.Attr("id")
+ str := class + id
+
+ if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
+ removeNodes(s)
+ }
+ })
+}
+
+func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
+ var best *candidate
+
+ for _, c := range candidates {
+ if best == nil {
+ best = c
+ } else if best.score < c.score {
+ best = c
+ }
+ }
+
+ if best == nil {
+ best = &candidate{document.Find("body"), 0}
+ }
+
+ return best
+}
+
+// Loop through all paragraphs, and assign a score to them based on how content-y they look.
+// Then add their score to their parent node.
+// A score is determined by things like number of commas, class names, etc.
+// Maybe eventually link density.
+func getCandidates(document *goquery.Document) candidateList {
+ candidates := make(candidateList)
+
+ document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
+ text := s.Text()
+
+ // If this paragraph is less than 25 characters, don't even count it.
+ if len(text) < 25 {
+ return
+ }
+
+ parent := s.Parent()
+ parentNode := parent.Get(0)
+
+ grandParent := parent.Parent()
+ var grandParentNode *html.Node
+ if grandParent.Length() > 0 {
+ grandParentNode = grandParent.Get(0)
+ }
+
+ if _, found := candidates[parentNode]; !found {
+ candidates[parentNode] = scoreNode(parent)
+ }
+
+ if grandParentNode != nil {
+ if _, found := candidates[grandParentNode]; !found {
+ candidates[grandParentNode] = scoreNode(grandParent)
+ }
+ }
+
+ // Add a point for the paragraph itself as a base.
+ contentScore := float32(1.0)
+
+ // Add points for any commas within this paragraph.
+ contentScore += float32(strings.Count(text, ",") + 1)
+
+ // For every 100 characters in this paragraph, add another point. Up to 3 points.
+ contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))
+
+ candidates[parentNode].score += contentScore
+ if grandParentNode != nil {
+ candidates[grandParentNode].score += contentScore / 2.0
+ }
+ })
+
+ // Scale the final candidates score based on link density. Good content
+ // should have a relatively small link density (5% or less) and be mostly
+ // unaffected by this operation
+ for _, candidate := range candidates {
+ candidate.score = candidate.score * (1 - getLinkDensity(candidate.selection))
+ }
+
+ return candidates
+}
+
+// scoreNode creates a candidate for the selection and seeds its score from
+// the element type: content-bearing containers (div, pre, td, blockquote,
+// img) score up, list/form/heading elements score down. The class/id weight
+// from getClassWeight is then added on top.
+func scoreNode(s *goquery.Selection) *candidate {
+ c := &candidate{selection: s, score: 0}
+
+ switch s.Get(0).DataAtom.String() {
+ case "div":
+ c.score += 5
+ case "pre", "td", "blockquote", "img":
+ c.score += 3
+ case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
+ c.score -= 3
+ case "h1", "h2", "h3", "h4", "h5", "h6", "th":
+ c.score -= 5
+ }
+
+ c.score += getClassWeight(s)
+ return c
+}
+
+// Get the density of links as a percentage of the content
+// This is the amount of text that is inside a link divided by the total text in the node.
+func getLinkDensity(s *goquery.Selection) float32 {
+ linkLength := len(s.Find("a").Text())
+ textLength := len(s.Text())
+
+ if textLength == 0 {
+ return 0
+ }
+
+ return float32(linkLength) / float32(textLength)
+}
+
+// Get an elements class/id weight. Uses regular expressions to tell if this
+// element looks good or bad.
+func getClassWeight(s *goquery.Selection) float32 {
+ weight := 0
+ class, _ := s.Attr("class")
+ id, _ := s.Attr("id")
+
+ if class != "" {
+ if negativeRegexp.MatchString(class) {
+ weight -= 25
+ }
+
+ if positiveRegexp.MatchString(class) {
+ weight += 25
+ }
+ }
+
+ if id != "" {
+ if negativeRegexp.MatchString(id) {
+ weight -= 25
+ }
+
+ if positiveRegexp.MatchString(id) {
+ weight += 25
+ }
+ }
+
+ return float32(weight)
+}
+
+// transformMisusedDivsIntoParagraphs renames <div> elements that contain no
+// block-level children into <p>, so they are scored like paragraphs.
+func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
+ document.Find("div").Each(func(i int, s *goquery.Selection) {
+ html, _ := s.Html()
+ if !divToPElementsRegexp.MatchString(html) {
+ node := s.Get(0)
+ // NOTE(review): only node.Data is renamed; node.DataAtom still reports
+ // "div", so any later DataAtom-based check (e.g. scoreNode) will still
+ // see a div — confirm this asymmetry is intended.
+ node.Data = "p"
+ }
+ })
+}
+
+// removeNodes detaches every node in the selection from its parent,
+// removing it (and its entire subtree) from the document.
+func removeNodes(s *goquery.Selection) {
+ s.Each(func(i int, s *goquery.Selection) {
+ parent := s.Parent()
+ // A node with no parent is already detached; nothing to do.
+ if parent.Length() > 0 {
+ parent.Get(0).RemoveChild(s.Get(0))
+ }
+ })
+}
diff --git a/src/scraper/sanitizer.go b/src/scraper/sanitizer.go
new file mode 100644
index 0000000..8deae25
--- /dev/null
+++ b/src/scraper/sanitizer.go
@@ -0,0 +1,508 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "regexp"
+ "strconv"
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
+var (
+ youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
+ splitSrcsetRegex = regexp.MustCompile(`,\s+`)
+)
+
+// Sanitize returns safe HTML: only allow-listed tags and attributes survive,
+// blocked subtrees (script/style/noscript) are dropped together with their
+// text, pixel trackers are removed, and iframe fallback text is suppressed.
+// On any tokenizer error other than EOF it fails closed and returns "".
+func Sanitize(baseURL, input string) string {
+	var buffer bytes.Buffer
+	var tagStack []string
+	var parentTag string
+	blacklistedTagDepth := 0
+
+	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
+	for {
+		if tokenizer.Next() == html.ErrorToken {
+			err := tokenizer.Err()
+			if err == io.EOF {
+				return buffer.String()
+			}
+
+			return ""
+		}
+
+		token := tokenizer.Token()
+		switch token.Type {
+		case html.TextToken:
+			// Drop text nested inside a blacklisted tag (script/style/...).
+			if blacklistedTagDepth > 0 {
+				continue
+			}
+
+			// An iframe element never has fallback content.
+			// See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
+			if parentTag == "iframe" {
+				continue
+			}
+
+			buffer.WriteString(html.EscapeString(token.Data))
+		case html.StartTagToken:
+			tagName := token.DataAtom.String()
+			parentTag = tagName
+
+			if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
+				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
+
+				// Tags such as a/img/iframe/source are useless without their
+				// key attribute and are dropped entirely.
+				if hasRequiredAttributes(tagName, attrNames) {
+					if len(attrNames) > 0 {
+						buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
+					} else {
+						buffer.WriteString("<" + tagName + ">")
+					}
+
+					tagStack = append(tagStack, tagName)
+				}
+			} else if isBlockedTag(tagName) {
+				blacklistedTagDepth++
+			}
+		case html.EndTagToken:
+			tagName := token.DataAtom.String()
+			// Only close tags that were actually opened and kept.
+			if isValidTag(tagName) && inList(tagName, tagStack) {
+				// BUG FIX: the closing tag must be written as "</name>"; the
+				// format string had lost its "</" during transcription.
+				buffer.WriteString(fmt.Sprintf("</%s>", tagName))
+			} else if isBlockedTag(tagName) {
+				blacklistedTagDepth--
+			}
+		case html.SelfClosingTagToken:
+			tagName := token.DataAtom.String()
+			if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
+				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
+
+				if hasRequiredAttributes(tagName, attrNames) {
+					if len(attrNames) > 0 {
+						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
+					} else {
+						buffer.WriteString("<" + tagName + "/>")
+					}
+				}
+			}
+		}
+	}
+}
+
+// sanitizeAttributes filters a tag's attributes: attributes not on the tag's
+// allow-list are dropped, srcset values are rewritten, external resource URLs
+// (src/href/poster/cite) are resolved against baseURL and checked against the
+// scheme whitelist and resource blacklist, and iframe sources are restricted
+// to known hosts. Forced extra attributes (rel/target/loading/...) are then
+// appended. It returns the kept attribute names and the serialized
+// key="value" attribute string.
+func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
+ var htmlAttrs, attrNames []string
+
+ for _, attribute := range attributes {
+ value := attribute.Val
+
+ if !isValidAttribute(tagName, attribute.Key) {
+ continue
+ }
+
+ if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
+ value = sanitizeSrcsetAttr(baseURL, value)
+ }
+
+ if isExternalResourceAttribute(attribute.Key) {
+ if tagName == "iframe" {
+ // Unknown iframe hosts are dropped; known ones may be rewritten
+ // (e.g. youtube.com -> youtube-nocookie.com).
+ if isValidIframeSource(baseURL, attribute.Val) {
+ value = rewriteIframeURL(attribute.Val)
+ } else {
+ continue
+ }
+ } else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
+ // Safe image data: URIs are kept verbatim.
+ value = attribute.Val
+ } else {
+ value = absoluteUrl(value, baseURL)
+ if value == "" {
+ continue
+ }
+
+ if !hasValidURIScheme(value) || isBlockedResource(value) {
+ continue
+ }
+ }
+ }
+
+ attrNames = append(attrNames, attribute.Key)
+ htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
+ }
+
+ extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
+ if len(extraAttrNames) > 0 {
+ attrNames = append(attrNames, extraAttrNames...)
+ htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
+ }
+
+ return attrNames, strings.Join(htmlAttrs, " ")
+}
+
+func getExtraAttributes(tagName string) ([]string, []string) {
+ switch tagName {
+ case "a":
+ return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
+ case "video", "audio":
+ return []string{"controls"}, []string{"controls"}
+ case "iframe":
+ return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
+ case "img":
+ return []string{"loading"}, []string{`loading="lazy"`}
+ default:
+ return nil, nil
+ }
+}
+
+func isValidTag(tagName string) bool {
+ for element := range getTagAllowList() {
+ if tagName == element {
+ return true
+ }
+ }
+
+ return false
+}
+
+func isValidAttribute(tagName, attributeName string) bool {
+ for element, attributes := range getTagAllowList() {
+ if tagName == element {
+ if inList(attributeName, attributes) {
+ return true
+ }
+ }
+ }
+
+ return false
+}
+
+func isExternalResourceAttribute(attribute string) bool {
+ switch attribute {
+ case "src", "href", "poster", "cite":
+ return true
+ default:
+ return false
+ }
+}
+
+func isPixelTracker(tagName string, attributes []html.Attribute) bool {
+ if tagName == "img" {
+ hasHeight := false
+ hasWidth := false
+
+ for _, attribute := range attributes {
+ if attribute.Key == "height" && attribute.Val == "1" {
+ hasHeight = true
+ }
+
+ if attribute.Key == "width" && attribute.Val == "1" {
+ hasWidth = true
+ }
+ }
+
+ return hasHeight && hasWidth
+ }
+
+ return false
+}
+
+func hasRequiredAttributes(tagName string, attributes []string) bool {
+ elements := make(map[string][]string)
+ elements["a"] = []string{"href"}
+ elements["iframe"] = []string{"src"}
+ elements["img"] = []string{"src"}
+ elements["source"] = []string{"src", "srcset"}
+
+ for element, attrs := range elements {
+ if tagName == element {
+ for _, attribute := range attributes {
+ for _, attr := range attrs {
+ if attr == attribute {
+ return true
+ }
+ }
+ }
+
+ return false
+ }
+ }
+
+ return true
+}
+
+// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+func hasValidURIScheme(src string) bool {
+ whitelist := []string{
+ "apt:",
+ "bitcoin:",
+ "callto:",
+ "dav:",
+ "davs:",
+ "ed2k://",
+ "facetime://",
+ "feed:",
+ "ftp://",
+ "geo:",
+ "gopher://",
+ "git://",
+ "http://",
+ "https://",
+ "irc://",
+ "irc6://",
+ "ircs://",
+ "itms://",
+ "itms-apps://",
+ "magnet:",
+ "mailto:",
+ "news:",
+ "nntp:",
+ "rtmp://",
+ "sip:",
+ "sips:",
+ "skype:",
+ "spotify:",
+ "ssh://",
+ "sftp://",
+ "steam://",
+ "svn://",
+ "svn+ssh://",
+ "tel:",
+ "webcal://",
+ "xmpp:",
+ }
+
+ for _, prefix := range whitelist {
+ if strings.HasPrefix(src, prefix) {
+ return true
+ }
+ }
+
+ return false
+}
+
+func isBlockedResource(src string) bool {
+ blacklist := []string{
+ "feedsportal.com",
+ "api.flattr.com",
+ "stats.wordpress.com",
+ "plus.google.com/share",
+ "twitter.com/share",
+ "feeds.feedburner.com",
+ }
+
+ for _, element := range blacklist {
+ if strings.Contains(src, element) {
+ return true
+ }
+ }
+
+ return false
+}
+
+func isValidIframeSource(baseURL, src string) bool {
+ whitelist := []string{
+ "https://invidio.us",
+ "//www.youtube.com",
+ "http://www.youtube.com",
+ "https://www.youtube.com",
+ "https://www.youtube-nocookie.com",
+ "http://player.vimeo.com",
+ "https://player.vimeo.com",
+ "http://www.dailymotion.com",
+ "https://www.dailymotion.com",
+ "http://vk.com",
+ "https://vk.com",
+ "http://soundcloud.com",
+ "https://soundcloud.com",
+ "http://w.soundcloud.com",
+ "https://w.soundcloud.com",
+ "http://bandcamp.com",
+ "https://bandcamp.com",
+ "https://cdn.embedly.com",
+ "https://player.bilibili.com",
+ }
+
+ // allow iframe from same origin
+ if urlDomain(baseURL) == urlDomain(src) {
+ return true
+ }
+
+ for _, prefix := range whitelist {
+ if strings.HasPrefix(src, prefix) {
+ return true
+ }
+ }
+
+ return false
+}
+
+// getTagAllowList maps each allowed tag name to the attribute names
+// permitted on it. Tags not present here are stripped by Sanitize.
+func getTagAllowList() map[string][]string {
+	whitelist := make(map[string][]string)
+	whitelist["img"] = []string{"alt", "title", "src", "srcset", "sizes"}
+	whitelist["picture"] = []string{}
+	whitelist["audio"] = []string{"src"}
+	whitelist["video"] = []string{"poster", "height", "width", "src"}
+	whitelist["source"] = []string{"src", "type", "srcset", "sizes", "media"}
+	whitelist["dt"] = []string{}
+	whitelist["dd"] = []string{}
+	whitelist["dl"] = []string{}
+	whitelist["table"] = []string{}
+	whitelist["caption"] = []string{}
+	whitelist["thead"] = []string{}
+	// BUG FIX: "tfooter" is not an HTML element (the table section tags are
+	// thead/tbody/tfoot), so that entry could never match a real tag and
+	// <tfoot>/<tbody> were being stripped from tables.
+	whitelist["tbody"] = []string{}
+	whitelist["tfoot"] = []string{}
+	whitelist["tr"] = []string{}
+	whitelist["td"] = []string{"rowspan", "colspan"}
+	whitelist["th"] = []string{"rowspan", "colspan"}
+	whitelist["h1"] = []string{}
+	whitelist["h2"] = []string{}
+	whitelist["h3"] = []string{}
+	whitelist["h4"] = []string{}
+	whitelist["h5"] = []string{}
+	whitelist["h6"] = []string{}
+	whitelist["strong"] = []string{}
+	whitelist["em"] = []string{}
+	whitelist["code"] = []string{}
+	whitelist["pre"] = []string{}
+	whitelist["blockquote"] = []string{}
+	whitelist["q"] = []string{"cite"}
+	whitelist["p"] = []string{}
+	whitelist["ul"] = []string{}
+	whitelist["li"] = []string{}
+	whitelist["ol"] = []string{}
+	whitelist["br"] = []string{}
+	whitelist["del"] = []string{}
+	whitelist["a"] = []string{"href", "title"}
+	whitelist["figure"] = []string{}
+	whitelist["figcaption"] = []string{}
+	whitelist["cite"] = []string{}
+	whitelist["time"] = []string{"datetime"}
+	whitelist["abbr"] = []string{"title"}
+	whitelist["acronym"] = []string{"title"}
+	whitelist["wbr"] = []string{}
+	whitelist["dfn"] = []string{}
+	whitelist["sub"] = []string{}
+	whitelist["sup"] = []string{}
+	whitelist["var"] = []string{}
+	whitelist["samp"] = []string{}
+	whitelist["s"] = []string{}
+	whitelist["ins"] = []string{}
+	whitelist["kbd"] = []string{}
+	whitelist["rp"] = []string{}
+	whitelist["rt"] = []string{}
+	whitelist["rtc"] = []string{}
+	whitelist["ruby"] = []string{}
+	whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"}
+	return whitelist
+}
+
+func inList(needle string, haystack []string) bool {
+ for _, element := range haystack {
+ if element == needle {
+ return true
+ }
+ }
+
+ return false
+}
+
+func rewriteIframeURL(link string) string {
+ matches := youtubeEmbedRegex.FindStringSubmatch(link)
+ if len(matches) == 2 {
+ return `https://www.youtube-nocookie.com/embed/` + matches[1]
+ }
+
+ return link
+}
+
+func isBlockedTag(tagName string) bool {
+ blacklist := []string{
+ "noscript",
+ "script",
+ "style",
+ }
+
+ for _, element := range blacklist {
+ if element == tagName {
+ return true
+ }
+ }
+
+ return false
+}
+
+/*
+
+One or more strings separated by commas, indicating possible image sources for the user agent to use.
+
+Each string is composed of:
+- A URL to an image
+- Optionally, whitespace followed by one of:
+- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
+- A pixel density descriptor (a positive floating point number directly followed by x).
+
+*/
+func sanitizeSrcsetAttr(baseURL, value string) string {
+ var sanitizedSources []string
+ rawSources := splitSrcsetRegex.Split(value, -1)
+ for _, rawSource := range rawSources {
+ parts := strings.Split(strings.TrimSpace(rawSource), " ")
+ nbParts := len(parts)
+
+ if nbParts > 0 {
+ sanitizedSource := parts[0]
+ if !strings.HasPrefix(parts[0], "data:") {
+ sanitizedSource = absoluteUrl(parts[0], baseURL)
+ if sanitizedSource == "" {
+ continue
+ }
+ }
+
+ if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) {
+ sanitizedSource += " " + parts[1]
+ }
+
+ sanitizedSources = append(sanitizedSources, sanitizedSource)
+ }
+ }
+ return strings.Join(sanitizedSources, ", ")
+}
+
+func isValidWidthOrDensityDescriptor(value string) bool {
+ if value == "" {
+ return false
+ }
+
+ lastChar := value[len(value)-1:]
+ if lastChar != "w" && lastChar != "x" {
+ return false
+ }
+
+ _, err := strconv.ParseFloat(value[0:len(value)-1], 32)
+ return err == nil
+}
+
+func isValidDataAttribute(value string) bool {
+ var dataAttributeAllowList = []string{
+ "data:image/avif",
+ "data:image/apng",
+ "data:image/png",
+ "data:image/svg",
+ "data:image/svg+xml",
+ "data:image/jpg",
+ "data:image/jpeg",
+ "data:image/gif",
+ "data:image/webp",
+ }
+
+ for _, prefix := range dataAttributeAllowList {
+ if strings.HasPrefix(value, prefix) {
+ return true
+ }
+ }
+ return false
+}
diff --git a/src/scraper/sanitizer_test.go b/src/scraper/sanitizer_test.go
new file mode 100644
index 0000000..e772f87
--- /dev/null
+++ b/src/scraper/sanitizer_test.go
@@ -0,0 +1,552 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import "testing"
+
+func TestValidInput(t *testing.T) {
+ input := `This is a text with an image:
.
`
+ output := Sanitize("http://example.org/", input)
+
+ if input != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, input, output)
+ }
+}
+
+func TestImgWithTextDataURL(t *testing.T) {
+ input := `
`
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if output != expected {
+ t.Errorf(`Wrong output: %s`, output)
+ }
+}
+
+func TestImgWithDataURL(t *testing.T) {
+ input := `
`
+ expected := `
`
+ output := Sanitize("http://example.org/", input)
+
+ if output != expected {
+ t.Errorf(`Wrong output: %s`, output)
+ }
+}
+
+func TestImgWithSrcset(t *testing.T) {
+ input := `
`
+ expected := `
`
+ output := Sanitize("http://example.org/", input)
+
+ if output != expected {
+ t.Errorf(`Wrong output: %s`, output)
+ }
+}
+
+func TestImgWithSrcsetAndDataURL(t *testing.T) {
+ input := `
`
+ expected := `
`
+ output := Sanitize("http://example.org/", input)
+
+ if output != expected {
+ t.Errorf(`Wrong output: %s`, output)
+ }
+}
+
+func TestSourceWithSrcsetAndMedia(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if output != expected {
+ t.Errorf(`Wrong output: %s`, output)
+ }
+}
+
+func TestMediumImgWithSrcset(t *testing.T) {
+ input := `
`
+ expected := `
`
+ output := Sanitize("http://example.org/", input)
+
+ if output != expected {
+ t.Errorf(`Wrong output: %s`, output)
+ }
+}
+
+func TestSelfClosingTags(t *testing.T) {
+ input := `This
is a text
with an image:
.
`
+ output := Sanitize("http://example.org/", input)
+
+ if input != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, input, output)
+ }
+}
+
+func TestTable(t *testing.T) {
+ input := ``
+ output := Sanitize("http://example.org/", input)
+
+ if input != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, input, output)
+ }
+}
+
+func TestRelativeURL(t *testing.T) {
+ input := `This link is relative and this image:
`
+ expected := `This link is relative and this image:
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestProtocolRelativeURL(t *testing.T) {
+ input := `This link is relative.`
+ expected := `This link is relative.`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestInvalidTag(t *testing.T) {
+ input := `My invalid tag.
`
+ expected := `My invalid tag.
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestVideoTag(t *testing.T) {
+ input := `My valid .
`
+ expected := `My valid .
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestAudioAndSourceTag(t *testing.T) {
+ input := `My music .
`
+ expected := `My music .
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestUnknownTag(t *testing.T) {
+ input := `My invalid tag.
`
+ expected := `My invalid tag.
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestInvalidNestedTag(t *testing.T) {
+ input := `My invalid tag with some valid tag.
`
+ expected := `My invalid tag with some valid tag.
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestInvalidIFrame(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.com/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestIFrameWithChildElements(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.com/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestInvalidURLScheme(t *testing.T) {
+ input := `This link is not valid
`
+ expected := `This link is not valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestAPTURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestBitcoinURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestCallToURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestFeedURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+
+ input = `This link is valid
`
+ expected = `This link is valid
`
+ output = Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestGeoURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestItunesURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+
+ input = `This link is valid
`
+ expected = `This link is valid
`
+ output = Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestMagnetURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestMailtoURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestNewsURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+
+ input = `This link is valid
`
+ expected = `This link is valid
`
+ output = Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+
+ input = `This link is valid
`
+ expected = `This link is valid
`
+ output = Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestRTMPURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestSIPURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+
+ input = `This link is valid
`
+ expected = `This link is valid
`
+ output = Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestSkypeURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestSpotifyURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestSteamURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestSubversionURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+
+ input = `This link is valid
`
+ expected = `This link is valid
`
+ output = Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestTelURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestWebcalURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestXMPPURIScheme(t *testing.T) {
+ input := `This link is valid
`
+ expected := `This link is valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestBlacklistedLink(t *testing.T) {
+ input := `This image is not valid 
`
+ expected := `This image is not valid
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestPixelTracker(t *testing.T) {
+ input := `
and 
`
+ expected := ` and
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestXmlEntities(t *testing.T) {
+ input := `echo "test" > /etc/hosts
`
+ expected := `echo "test" > /etc/hosts
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestEscapeAttributes(t *testing.T) {
+ input := `test | `
+ expected := `test | `
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceYoutubeURL(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceSecureYoutubeURL(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceSecureYoutubeURLWithParameters(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceYoutubeURLAlreadyReplaced(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceProtocolRelativeYoutubeURL(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceIframeURL(t *testing.T) {
+ input := ``
+ expected := ``
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceNoScript(t *testing.T) {
+ input := `Before paragraph.
After paragraph.
`
+ expected := `Before paragraph.
After paragraph.
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceScript(t *testing.T) {
+ input := `Before paragraph.
After paragraph.
`
+ expected := `Before paragraph.
After paragraph.
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
+
+func TestReplaceStyle(t *testing.T) {
+ input := `Before paragraph.
After paragraph.
`
+ expected := `Before paragraph.
After paragraph.
`
+ output := Sanitize("http://example.org/", input)
+
+ if expected != output {
+ t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+ }
+}
diff --git a/src/scraper/utils.go b/src/scraper/utils.go
index dba54b3..3a745ea 100644
--- a/src/scraper/utils.go
+++ b/src/scraper/utils.go
@@ -65,3 +65,10 @@ func absoluteUrl(href, base string) string {
}
return baseUrl.ResolveReference(hrefUrl).String()
}
+
+func urlDomain(val string) string {
+	if u, err := url.Parse(val); err == nil && u.Host != "" {
+		return u.Host
+	}
+	return val
+}