From a83d43a5b17d30089c7f45d751f071ac9b13e443 Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Mon, 29 Mar 2021 13:58:47 +0100 Subject: [PATCH] borrow miniflux code --- go.mod | 1 + go.sum | 2 + src/scraper/readability.go | 305 +++++++++++++++++++ src/scraper/sanitizer.go | 508 +++++++++++++++++++++++++++++++ src/scraper/sanitizer_test.go | 552 ++++++++++++++++++++++++++++++++++ src/scraper/utils.go | 7 + 6 files changed, 1375 insertions(+) create mode 100644 src/scraper/readability.go create mode 100644 src/scraper/sanitizer.go create mode 100644 src/scraper/sanitizer_test.go diff --git a/go.mod b/go.mod index c55fecf..cbc46b2 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/nkanaev/yarr go 1.16 require ( + github.com/PuerkitoBio/goquery v1.5.1 github.com/mattn/go-sqlite3 v1.14.0 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13 diff --git a/go.sum b/go.sum index 054e8f1..02859e5 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,6 @@ +github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA= github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus= diff --git a/src/scraper/readability.go b/src/scraper/readability.go new file mode 100644 index 0000000..581c8af --- /dev/null +++ b/src/scraper/readability.go @@ -0,0 +1,305 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. 
+
+package scraper
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	//"log"
+	"math"
+	"regexp"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+	"golang.org/x/net/html"
+)
+
+const (
+	// defaultTagsToScore is the selector of the elements whose text is scored by getCandidates.
+	defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
+)
+
+var (
+	// divToPElementsRegexp detects nested markup that keeps a div from being renamed to p.
+	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
+	// sentenceRegexp matches a period followed by a space or by the end of the text.
+	sentenceRegexp = regexp.MustCompile(`\.( |$)`)
+
+	// These three patterns are matched against the concatenated class and id
+	// attributes by removeUnlikelyCandidates.
+	blacklistCandidatesRegexp  = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
+	okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
+	unlikelyCandidatesRegexp   = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
+
+	// negativeRegexp and positiveRegexp are matched against class and id values
+	// to lower or raise the weight of an element.
+	negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
+	positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
+)
+
+// candidate pairs a selection with the content score computed for it.
+type candidate struct {
+	selection *goquery.Selection
+	score     float32
+}
+
+// Node returns the first HTML node of the candidate's selection.
+func (c *candidate) Node() *html.Node {
+	return c.selection.Get(0)
+}
+
+// String formats the candidate as "tag#id.class => score" for debugging;
+// the id and class parts are left out when the attribute is empty.
+func (c *candidate) String() string {
+	id, _ := c.selection.Attr("id")
+	class, _ := c.selection.Attr("class")
+
+	if id != "" && class != "" {
+		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
+	} else if id != "" {
+		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
+	} else if class != "" {
+		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
+	}
+
+	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
+}
+
+// candidateList indexes candidates by their HTML node.
+type candidateList map[*html.Node]*candidate
+
+// String joins the description of every candidate with ", ".
+// The order follows map iteration and is therefore not stable.
+func (c candidateList) String() string {
+	var output []string
+	for _, candidate := range c {
+		output = append(output, candidate.String())
+	}
+
+	return strings.Join(output, ", ")
+}
+
+// ExtractContent returns relevant content.
+//
+// The page is parsed, script and style elements are removed, divs without
+// block-level children are renamed to paragraphs, unlikely candidates are
+// dropped, and the best scored candidate is rendered together with its
+// related siblings. An error is returned only when the page cannot be parsed.
+func ExtractContent(page io.Reader) (string, error) {
+	document, err := goquery.NewDocumentFromReader(page)
+	if err != nil {
+		return "", err
+	}
+
+	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
+		removeNodes(s)
+	})
+
+	transformMisusedDivsIntoParagraphs(document)
+	removeUnlikelyCandidates(document)
+
+	candidates := getCandidates(document)
+	//log.Printf("[Readability] Candidates: %v", candidates)
+
+	topCandidate := getTopCandidate(document, candidates)
+	//log.Printf("[Readability] TopCandidate: %v", topCandidate)
+
+	output := getArticle(topCandidate, candidates)
+	return output, nil
+}
+
+// Now that we have the top candidate, look through its siblings for content that might also be related.
+// Things like preambles, content split by ads that we removed, etc.
+//
+// The result is a single <div> wrapping every kept node; each kept node is
+// written as an element with matching opening and closing tags.
+func getArticle(topCandidate *candidate, candidates candidateList) string {
+	output := bytes.NewBufferString("<div>")
+	siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2)))
+
+	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
+		// keep is deliberately not called "append" so the builtin stays usable.
+		keep := false
+		node := s.Get(0)
+
+		if node == topCandidate.Node() {
+			keep = true
+		} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
+			keep = true
+		}
+
+		if s.Is("p") {
+			linkDensity := getLinkDensity(s)
+			content := s.Text()
+			contentLength := len(content)
+
+			if contentLength >= 80 && linkDensity < .25 {
+				keep = true
+			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
+				keep = true
+			}
+		}
+
+		if !keep {
+			return
+		}
+
+		tag := "div"
+		if s.Is("p") {
+			tag = node.Data
+		}
+
+		// A node that cannot be rendered is skipped instead of being emitted empty.
+		inner, err := s.Html()
+		if err != nil {
+			return
+		}
+
+		// Three arguments need three verbs: opening tag, content, closing tag.
+		fmt.Fprintf(output, "<%s>%s</%s>", tag, inner, tag)
+	})
+
+	output.Write([]byte("</div>"))
+	return output.String()
+}
+
+// removeUnlikelyCandidates detaches every element (except html and body) whose
+// class+id matches the blacklist, or matches the unlikely pattern without also
+// matching the "maybe a candidate" pattern.
+func removeUnlikelyCandidates(document *goquery.Document) {
+	document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
+		class, _ := s.Attr("class")
+		id, _ := s.Attr("id")
+		str := class + id
+
+		if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
+			removeNodes(s)
+		}
+	})
+}
+
+// getTopCandidate returns the candidate with the highest score, or a zero
+// scored candidate wrapping the body element when there are no candidates.
+// Candidates with equal scores are picked in map iteration order.
+func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
+	var best *candidate
+
+	for _, c := range candidates {
+		if best == nil {
+			best = c
+		} else if best.score < c.score {
+			best = c
+		}
+	}
+
+	if best == nil {
+		best = &candidate{document.Find("body"), 0}
+	}
+
+	return best
+}
+
+// Loop through all paragraphs, and assign a score to them based on how content-y they look.
+// Then add their score to their parent node.
+// A score is determined by things like number of commas, class names, etc.
+// Maybe eventually link density.
+func getCandidates(document *goquery.Document) candidateList {
+	candidates := make(candidateList)
+
+	document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
+		text := s.Text()
+
+		// If this paragraph is less than 25 characters, don't even count it.
+		if len(text) < 25 {
+			return
+		}
+
+		parent := s.Parent()
+		parentNode := parent.Get(0)
+
+		grandParent := parent.Parent()
+		var grandParentNode *html.Node
+		if grandParent.Length() > 0 {
+			grandParentNode = grandParent.Get(0)
+		}
+
+		if _, found := candidates[parentNode]; !found {
+			candidates[parentNode] = scoreNode(parent)
+		}
+
+		if grandParentNode != nil {
+			if _, found := candidates[grandParentNode]; !found {
+				candidates[grandParentNode] = scoreNode(grandParent)
+			}
+		}
+
+		// Add a point for the paragraph itself as a base.
+		contentScore := float32(1.0)
+
+		// Add points for any commas within this paragraph.
+		contentScore += float32(strings.Count(text, ",") + 1)
+
+		// For every 100 characters in this paragraph, add another point. Up to 3 points.
+		contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3))
+
+		// The parent receives the full score, the grandparent half of it.
+		candidates[parentNode].score += contentScore
+		if grandParentNode != nil {
+			candidates[grandParentNode].score += contentScore / 2.0
+		}
+	})
+
+	// Scale the final candidates score based on link density. Good content
+	// should have a relatively small link density (5% or less) and be mostly
+	// unaffected by this operation
+	for _, candidate := range candidates {
+		candidate.score = candidate.score * (1 - getLinkDensity(candidate.selection))
+	}
+
+	return candidates
+}
+
+// scoreNode creates a candidate whose initial score depends on the tag name
+// of the first node in s plus the class/id weight of the selection.
+func scoreNode(s *goquery.Selection) *candidate {
+	c := &candidate{selection: s, score: 0}
+
+	switch s.Get(0).DataAtom.String() {
+	case "div":
+		c.score += 5
+	case "pre", "td", "blockquote", "img":
+		c.score += 3
+	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
+		c.score -= 3
+	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
+		c.score -= 5
+	}
+
+	c.score += getClassWeight(s)
+	return c
+}
+
+// Get the density of links as a percentage of the content
+// This is the amount of text that is inside a link divided by the total text in the node.
+// Lengths are byte lengths; a selection without text has a density of 0.
+func getLinkDensity(s *goquery.Selection) float32 {
+	linkLength := len(s.Find("a").Text())
+	textLength := len(s.Text())
+
+	if textLength == 0 {
+		return 0
+	}
+
+	return float32(linkLength) / float32(textLength)
+}
+
+// Get an elements class/id weight. Uses regular expressions to tell if this
+// element looks good or bad.
+// The class and id attributes are checked independently: each one subtracts
+// 25 when it matches the negative pattern and adds 25 when it matches the
+// positive pattern. Empty attributes are ignored.
+func getClassWeight(s *goquery.Selection) float32 {
+	var weight float32
+
+	for _, name := range []string{"class", "id"} {
+		value, _ := s.Attr(name)
+		if value == "" {
+			continue
+		}
+
+		if negativeRegexp.MatchString(value) {
+			weight -= 25
+		}
+
+		if positiveRegexp.MatchString(value) {
+			weight += 25
+		}
+	}
+
+	return weight
+}
+
+// transformMisusedDivsIntoParagraphs renames to "p" every div whose inner HTML
+// contains none of the elements listed in divToPElementsRegexp.
+func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
+	document.Find("div").Each(func(_ int, div *goquery.Selection) {
+		inner, _ := div.Html()
+		if divToPElementsRegexp.MatchString(inner) {
+			return
+		}
+
+		// Only the node name changes; attributes and children stay in place.
+		div.Get(0).Data = "p"
+	})
+}
+
+// removeNodes detaches each node of the selection from its parent.
+// Nodes without a parent are left alone.
+func removeNodes(s *goquery.Selection) {
+	s.Each(func(_ int, item *goquery.Selection) {
+		if parent := item.Parent(); parent.Length() > 0 {
+			parent.Get(0).RemoveChild(item.Get(0))
+		}
+	})
+}
diff --git a/src/scraper/sanitizer.go b/src/scraper/sanitizer.go
new file mode 100644
index 0000000..8deae25
--- /dev/null
+++ b/src/scraper/sanitizer.go
@@ -0,0 +1,508 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+var (
+	youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
+	splitSrcsetRegex  = regexp.MustCompile(`,\s+`)
+)
+
+// Sanitize returns safe HTML.
+//
+// The input is tokenized and only allow-listed tags and attributes are written
+// back. Text is HTML-escaped, URLs are resolved against baseURL, the content of
+// blocked tags (see isBlockedTag) and of iframes is dropped. An empty string is
+// returned when the tokenizer fails with anything other than io.EOF.
+func Sanitize(baseURL, input string) string {
+	var buffer bytes.Buffer
+	var tagStack []string
+	var parentTag string
+	blacklistedTagDepth := 0
+
+	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
+	for {
+		if tokenizer.Next() == html.ErrorToken {
+			err := tokenizer.Err()
+			if err == io.EOF {
+				return buffer.String()
+			}
+
+			return ""
+		}
+
+		token := tokenizer.Token()
+		switch token.Type {
+		case html.TextToken:
+			if blacklistedTagDepth > 0 {
+				continue
+			}
+
+			// An iframe element never has fallback content.
+			// See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
+			if parentTag == "iframe" {
+				continue
+			}
+
+			buffer.WriteString(html.EscapeString(token.Data))
+		case html.StartTagToken:
+			tagName := token.DataAtom.String()
+			parentTag = tagName
+
+			if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
+				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
+
+				if hasRequiredAttributes(tagName, attrNames) {
+					if len(attrNames) > 0 {
+						buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
+					} else {
+						buffer.WriteString("<" + tagName + ">")
+					}
+
+					tagStack = append(tagStack, tagName)
+				}
+			} else if isBlockedTag(tagName) {
+				blacklistedTagDepth++
+			}
+		case html.EndTagToken:
+			tagName := token.DataAtom.String()
+
+			// The element is closed: text that follows (for example after
+			// </iframe>) belongs to the surrounding content again.
+			parentTag = ""
+
+			if isValidTag(tagName) && inList(tagName, tagStack) {
+				buffer.WriteString("</" + tagName + ">")
+			} else if isBlockedTag(tagName) && blacklistedTagDepth > 0 {
+				// Guarded so that a stray closing tag cannot push the depth
+				// below zero and unbalance a later blocked element.
+				blacklistedTagDepth--
+			}
+		case html.SelfClosingTagToken:
+			tagName := token.DataAtom.String()
+			if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
+				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
+
+				if hasRequiredAttributes(tagName, attrNames) {
+					if len(attrNames) > 0 {
+						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
+					} else {
+						buffer.WriteString("<" + tagName + "/>")
+					}
+				}
+			}
+		}
+	}
+}
+
+// sanitizeAttributes filters the attributes of tagName and returns the names
+// of the attributes that were kept together with the rendered attribute
+// string (space separated, values HTML-escaped). Attributes that are not
+// allow-listed, or whose URL is rejected, are dropped; the extra attributes
+// from getExtraAttributes are always appended.
+// NOTE(review): absoluteUrl is defined elsewhere in the package; an empty
+// result is treated here as "URL could not be resolved".
+func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
+	var htmlAttrs, attrNames []string
+
+	for _, attribute := range attributes {
+		value := attribute.Val
+
+		if !isValidAttribute(tagName, attribute.Key) {
+			continue
+		}
+
+		if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
+			value = sanitizeSrcsetAttr(baseURL, value)
+		}
+
+		if isExternalResourceAttribute(attribute.Key) {
+			if tagName == "iframe" {
+				if isValidIframeSource(baseURL, attribute.Val) {
+					value = rewriteIframeURL(attribute.Val)
+				} else {
+					continue
+				}
+			} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
+				value = attribute.Val
+			} else {
+				value = absoluteUrl(value, baseURL)
+				if value == "" {
+					continue
+				}
+
+				if !hasValidURIScheme(value) || isBlockedResource(value) {
+					continue
+				}
+			}
+		}
+
+		attrNames = append(attrNames, attribute.Key)
+		htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
+	}
+
+	extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
+	if len(extraAttrNames) > 0 {
+		attrNames = append(attrNames, extraAttrNames...)
+		htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
+	}
+
+	return attrNames, strings.Join(htmlAttrs, " ")
+}
+
+// getExtraAttributes returns the attribute names and the rendered attributes
+// that are always added to the given tag, or nil slices for other tags.
+func getExtraAttributes(tagName string) ([]string, []string) {
+	switch tagName {
+	case "a":
+		return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
+	case "video", "audio":
+		return []string{"controls"}, []string{"controls"}
+	case "iframe":
+		return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
+	case "img":
+		return []string{"loading"}, []string{`loading="lazy"`}
+	default:
+		return nil, nil
+	}
+}
+
+// tagAllowList is built once so that tag and attribute checks are plain map
+// lookups instead of rebuilding and scanning the allow-list on every call.
+var tagAllowList = getTagAllowList()
+
+// isValidTag reports whether tagName is part of the allow-list.
+func isValidTag(tagName string) bool {
+	_, found := tagAllowList[tagName]
+	return found
+}
+
+// isValidAttribute reports whether attributeName is allowed on tagName.
+// Unknown tags have no allowed attributes.
+func isValidAttribute(tagName, attributeName string) bool {
+	return inList(attributeName, tagAllowList[tagName])
+}
+
+// isExternalResourceAttribute reports whether the attribute holds a URL.
+func isExternalResourceAttribute(attribute string) bool {
+	switch attribute {
+	case "src", "href", "poster", "cite":
+		return true
+	default:
+		return false
+	}
+}
+
+// isPixelTracker reports whether the tag is an img with both height="1" and
+// width="1".
+func isPixelTracker(tagName string, attributes []html.Attribute) bool {
+	if tagName == "img" {
+		hasHeight := false
+		hasWidth := false
+
+		for _, attribute := range attributes {
+			if attribute.Key == "height" && attribute.Val == "1" {
+				hasHeight = true
+			}
+
+			if attribute.Key == "width" && attribute.Val == "1" {
+				hasWidth = true
+			}
+		}
+
+		return hasHeight && hasWidth
+	}
+
+	return false
+}
+
+// hasRequiredAttributes reports whether the kept attributes contain at least
+// one of the attributes the tag cannot do without. Tags without such a
+// requirement always pass.
+func hasRequiredAttributes(tagName string, attributes []string) bool {
+	var required []string
+
+	switch tagName {
+	case "a":
+		required = []string{"href"}
+	case "iframe", "img":
+		required = []string{"src"}
+	case "source":
+		required = []string{"src", "srcset"}
+	default:
+		return true
+	}
+
+	for _, attribute := range attributes {
+		if inList(attribute, required) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+//
+// hasValidURIScheme reports whether src starts with one of the allowed scheme
+// prefixes. The comparison is case-sensitive.
+func hasValidURIScheme(src string) bool {
+	whitelist := []string{
+		"apt:",
+		"bitcoin:",
+		"callto:",
+		"dav:",
+		"davs:",
+		"ed2k://",
+		"facetime://",
+		"feed:",
+		"ftp://",
+		"geo:",
+		"gopher://",
+		"git://",
+		"http://",
+		"https://",
+		"irc://",
+		"irc6://",
+		"ircs://",
+		"itms://",
+		"itms-apps://",
+		"magnet:",
+		"mailto:",
+		"news:",
+		"nntp:",
+		"rtmp://",
+		"sip:",
+		"sips:",
+		"skype:",
+		"spotify:",
+		"ssh://",
+		"sftp://",
+		"steam://",
+		"svn://",
+		"svn+ssh://",
+		"tel:",
+		"webcal://",
+		"xmpp:",
+	}
+
+	for _, prefix := range whitelist {
+		if strings.HasPrefix(src, prefix) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isBlockedResource reports whether src contains one of the blocked hosts or
+// paths.
+func isBlockedResource(src string) bool {
+	blacklist := []string{
+		"feedsportal.com",
+		"api.flattr.com",
+		"stats.wordpress.com",
+		"plus.google.com/share",
+		"twitter.com/share",
+		"feeds.feedburner.com",
+	}
+
+	for _, element := range blacklist {
+		if strings.Contains(src, element) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isValidIframeSource reports whether an iframe may load src: either src has
+// the same domain as baseURL, or it points at one of the allowed origins.
+// NOTE(review): urlDomain is defined elsewhere in the package; confirm how it
+// behaves for URLs it cannot parse.
+func isValidIframeSource(baseURL, src string) bool {
+	whitelist := []string{
+		"https://invidio.us",
+		"//www.youtube.com",
+		"http://www.youtube.com",
+		"https://www.youtube.com",
+		"https://www.youtube-nocookie.com",
+		"http://player.vimeo.com",
+		"https://player.vimeo.com",
+		"http://www.dailymotion.com",
+		"https://www.dailymotion.com",
+		"http://vk.com",
+		"https://vk.com",
+		"http://soundcloud.com",
+		"https://soundcloud.com",
+		"http://w.soundcloud.com",
+		"https://w.soundcloud.com",
+		"http://bandcamp.com",
+		"https://bandcamp.com",
+		"https://cdn.embedly.com",
+		"https://player.bilibili.com",
+	}
+
+	// allow iframe from same origin
+	if urlDomain(baseURL) == urlDomain(src) {
+		return true
+	}
+
+	for _, prefix := range whitelist {
+		if !strings.HasPrefix(src, prefix) {
+			continue
+		}
+
+		// The allowed origin must end where the host ends. Without this check
+		// "https://www.youtube.com.evil.example/" or
+		// "https://www.youtube.com@evil.example/" would pass the prefix test.
+		rest := src[len(prefix):]
+		if rest == "" || rest[0] == '/' || rest[0] == '?' || rest[0] == '#' {
+			return true
+		}
+	}
+
+	return false
+}
+
+// getTagAllowList returns a fresh map of the allowed tags and, for each tag,
+// the attributes that may be kept. Tag names are compared with
+// token.DataAtom.String(), so only names of known HTML elements can match.
+func getTagAllowList() map[string][]string {
+	return map[string][]string{
+		"img":        {"alt", "title", "src", "srcset", "sizes"},
+		"picture":    {},
+		"audio":      {"src"},
+		"video":      {"poster", "height", "width", "src"},
+		"source":     {"src", "type", "srcset", "sizes", "media"},
+		"dt":         {},
+		"dd":         {},
+		"dl":         {},
+		"table":      {},
+		"caption":    {},
+		"thead":      {},
+		"tbody":      {},
+		"tfoot":      {}, // was "tfooter", which is not an HTML element
+		"tr":         {},
+		"td":         {"rowspan", "colspan"},
+		"th":         {"rowspan", "colspan"},
+		"h1":         {},
+		"h2":         {},
+		"h3":         {},
+		"h4":         {},
+		"h5":         {},
+		"h6":         {},
+		"strong":     {},
+		"em":         {},
+		"code":       {},
+		"pre":        {},
+		"blockquote": {},
+		"q":          {"cite"},
+		"p":          {},
+		"ul":         {},
+		"li":         {},
+		"ol":         {},
+		"br":         {},
+		"del":        {},
+		"a":          {"href", "title"},
+		"figure":     {},
+		"figcaption": {},
+		"cite":       {},
+		"time":       {"datetime"},
+		"abbr":       {"title"},
+		"acronym":    {"title"},
+		"wbr":        {},
+		"dfn":        {},
+		"sub":        {},
+		"sup":        {},
+		"var":        {},
+		"samp":       {},
+		"s":          {},
+		"ins":        {},
+		"kbd":        {},
+		"rp":         {},
+		"rt":         {},
+		"rtc":        {},
+		"ruby":       {},
+		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
+	}
+}
+
+// inList reports whether needle is one of the elements of haystack.
+func inList(needle string, haystack []string) bool {
+	for _, element := range haystack {
+		if element == needle {
+			return true
+		}
+	}
+
+	return false
+}
+
+// rewriteIframeURL points YouTube embed links at youtube-nocookie.com and
+// returns every other link unchanged.
+func rewriteIframeURL(link string) string {
+	matches := youtubeEmbedRegex.FindStringSubmatch(link)
+	if len(matches) == 2 {
+		return `https://www.youtube-nocookie.com/embed/` + matches[1]
+	}
+
+	return link
+}
+
+// isBlockedTag reports whether the tag and everything inside it is dropped.
+func isBlockedTag(tagName string) bool {
+	blacklist := []string{
+		"noscript",
+		"script",
+		"style",
+	}
+
+	for _, element := range blacklist {
+		if element == tagName {
+			return true
+		}
+	}
+
+	return false
+}
+
+/*
+
+One or more strings separated by commas, indicating possible image sources for the user agent to use.
+
+Each string is composed of:
+- A URL to an image
+- Optionally, whitespace followed by one of:
+- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
+- A pixel density descriptor (a positive floating point number directly followed by x).
+
+*/
+func sanitizeSrcsetAttr(baseURL, value string) string {
+	var kept []string
+
+	for _, entry := range splitSrcsetRegex.Split(value, -1) {
+		fields := strings.Split(strings.TrimSpace(entry), " ")
+		if len(fields) == 0 {
+			continue
+		}
+
+		// data: URLs are kept as they are, everything else is resolved
+		// against baseURL and dropped when that yields nothing.
+		source := fields[0]
+		if !strings.HasPrefix(source, "data:") {
+			source = absoluteUrl(source, baseURL)
+			if source == "" {
+				continue
+			}
+		}
+
+		// The descriptor survives only when it is the sole extra field and
+		// is well formed.
+		if len(fields) == 2 && isValidWidthOrDensityDescriptor(fields[1]) {
+			source += " " + fields[1]
+		}
+
+		kept = append(kept, source)
+	}
+
+	return strings.Join(kept, ", ")
+}
+
+// isValidWidthOrDensityDescriptor reports whether value is a number that
+// strconv.ParseFloat accepts, directly followed by "w" or "x".
+func isValidWidthOrDensityDescriptor(value string) bool {
+	if value == "" {
+		return false
+	}
+
+	last := len(value) - 1
+	if unit := value[last]; unit != 'w' && unit != 'x' {
+		return false
+	}
+
+	if _, err := strconv.ParseFloat(value[:last], 32); err != nil {
+		return false
+	}
+
+	return true
+}
+
+// isValidDataAttribute reports whether value starts with one of the allowed
+// image data URL prefixes.
+func isValidDataAttribute(value string) bool {
+	allowedPrefixes := []string{
+		"data:image/avif",
+		"data:image/apng",
+		"data:image/png",
+		"data:image/svg",
+		"data:image/svg+xml",
+		"data:image/jpg",
+		"data:image/jpeg",
+		"data:image/gif",
+		"data:image/webp",
+	}
+
+	for i := range allowedPrefixes {
+		if strings.HasPrefix(value, allowedPrefixes[i]) {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/src/scraper/sanitizer_test.go b/src/scraper/sanitizer_test.go
new file mode 100644
index 0000000..e772f87
--- /dev/null
+++ b/src/scraper/sanitizer_test.go
@@ -0,0 +1,552 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import "testing"
+
+func TestValidInput(t *testing.T) {
+	input := `

This is a text with an image: Test.

` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestImgWithTextDataURL(t *testing.T) { + input := `Example` + expected := `` + output := Sanitize("http://example.org/", input) + + if output != expected { + t.Errorf(`Wrong output: %s`, output) + } +} + +func TestImgWithDataURL(t *testing.T) { + input := `Example` + expected := `Example` + output := Sanitize("http://example.org/", input) + + if output != expected { + t.Errorf(`Wrong output: %s`, output) + } +} + +func TestImgWithSrcset(t *testing.T) { + input := `Example` + expected := `Example` + output := Sanitize("http://example.org/", input) + + if output != expected { + t.Errorf(`Wrong output: %s`, output) + } +} + +func TestImgWithSrcsetAndDataURL(t *testing.T) { + input := `Example` + expected := `Example` + output := Sanitize("http://example.org/", input) + + if output != expected { + t.Errorf(`Wrong output: %s`, output) + } +} + +func TestSourceWithSrcsetAndMedia(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if output != expected { + t.Errorf(`Wrong output: %s`, output) + } +} + +func TestMediumImgWithSrcset(t *testing.T) { + input := `Image for post` + expected := `Image for post` + output := Sanitize("http://example.org/", input) + + if output != expected { + t.Errorf(`Wrong output: %s`, output) + } +} + +func TestSelfClosingTags(t *testing.T) { + input := `

This
is a text
with an image: Test.

` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestTable(t *testing.T) { + input := `
AB
CDE
` + output := Sanitize("http://example.org/", input) + + if input != output { + t.Errorf(`Wrong output: "%s" != "%s"`, input, output) + } +} + +func TestRelativeURL(t *testing.T) { + input := `This link is relative and this image: ` + expected := `This link is relative and this image: ` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestProtocolRelativeURL(t *testing.T) { + input := `This link is relative.` + expected := `This link is relative.` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidTag(t *testing.T) { + input := `

My invalid tag.

` + expected := `

My invalid tag.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestVideoTag(t *testing.T) { + input := `

My valid .

` + expected := `

My valid .

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestAudioAndSourceTag(t *testing.T) { + input := `

My music .

` + expected := `

My music .

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestUnknownTag(t *testing.T) { + input := `

My invalid tag.

` + expected := `

My invalid tag.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidNestedTag(t *testing.T) { + input := `

My invalid tag with some valid tag.

` + expected := `

My invalid tag with some valid tag.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidIFrame(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.com/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestIFrameWithChildElements(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.com/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestInvalidURLScheme(t *testing.T) { + input := `

This link is not valid

` + expected := `

This link is not valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestAPTURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestBitcoinURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestCallToURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestFeedURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } + + input = `

This link is valid

` + expected = `

This link is valid

` + output = Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestGeoURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestItunesURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } + + input = `

This link is valid

` + expected = `

This link is valid

` + output = Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestMagnetURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestMailtoURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestNewsURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } + + input = `

This link is valid

` + expected = `

This link is valid

` + output = Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } + + input = `

This link is valid

` + expected = `

This link is valid

` + output = Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestRTMPURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestSIPURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } + + input = `

This link is valid

` + expected = `

This link is valid

` + output = Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestSkypeURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestSpotifyURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestSteamURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestSubversionURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } + + input = `

This link is valid

` + expected = `

This link is valid

` + output = Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestTelURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestWebcalURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestXMPPURIScheme(t *testing.T) { + input := `

This link is valid

` + expected := `

This link is valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestBlacklistedLink(t *testing.T) { + input := `

This image is not valid

` + expected := `

This image is not valid

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestPixelTracker(t *testing.T) { + input := `

and

` + expected := `

and

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestXmlEntities(t *testing.T) { + input := `
echo "test" > /etc/hosts
` + expected := `
echo "test" > /etc/hosts
` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestEspaceAttributes(t *testing.T) { + input := `test` + expected := `test` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceYoutubeURL(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceSecureYoutubeURL(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceSecureYoutubeURLWithParameters(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceYoutubeURLAlreadyReplaced(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceProtocolRelativeYoutubeURL(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceIframeURL(t *testing.T) { + input := `` + expected := `` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceNoScript(t *testing.T) { + input := `

Before paragraph.

After paragraph.

` + expected := `

Before paragraph.

After paragraph.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceScript(t *testing.T) { + input := `

Before paragraph.

After paragraph.

` + expected := `

Before paragraph.

After paragraph.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceStyle(t *testing.T) { + input := `

Before paragraph.

After paragraph.

` + expected := `

Before paragraph.

After paragraph.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} diff --git a/src/scraper/utils.go b/src/scraper/utils.go index dba54b3..3a745ea 100644 --- a/src/scraper/utils.go +++ b/src/scraper/utils.go @@ -65,3 +65,10 @@ func absoluteUrl(href, base string) string { } return baseUrl.ResolveReference(hrefUrl).String() } + +func urlDomain(val string) string { + if u, err := url.Parse(val); err == nil { + return u.Host + } + return val +}