reorganizing content-related packages

Nazar Kanaev 2021-03-31 23:40:59 +01:00
parent 0b8bf50204
commit b04e8c1e93
12 changed files with 19 additions and 18 deletions

View File

@@ -1,10 +1,10 @@
-package scraper
+package htmlutil
 
 import (
 	"net/url"
 )
 
-func any(els []string, el string, match func(string, string) bool) bool {
+func Any(els []string, el string, match func(string, string) bool) bool {
 	for _, x := range els {
 		if match(x, el) {
 			return true
@@ -13,7 +13,7 @@ func any(els []string, el string, match func(string, string) bool) bool {
 	return false
 }
 
-func absoluteUrl(href, base string) string {
+func AbsoluteUrl(href, base string) string {
 	baseUrl, err := url.Parse(base)
 	if err != nil {
 		return ""
@@ -25,7 +25,7 @@ func absoluteUrl(href, base string) string {
 	return baseUrl.ResolveReference(hrefUrl).String()
}
 
-func urlDomain(val string) string {
+func URLDomain(val string) string {
 	if u, err := url.Parse(val); err == nil {
 		return u.Host
 	}
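
The three helpers are now exported (any → Any, absoluteUrl → AbsoluteUrl, urlDomain → URLDomain) so other packages can reach them through the new import path. A minimal usage sketch, not part of the commit, based only on the signatures visible above:

package main

import (
	"fmt"
	"strings"

	"github.com/nkanaev/yarr/src/content/htmlutil"
)

func main() {
	// Resolve a relative href against a base page URL.
	fmt.Println(htmlutil.AbsoluteUrl("/feed.xml", "https://example.com/blog/"))
	// -> https://example.com/feed.xml

	// Membership test with a caller-supplied matcher, here case-insensitive.
	rels := []string{"alternate", "stylesheet"}
	fmt.Println(htmlutil.Any(rels, "ALTERNATE", strings.EqualFold)) // -> true

	// Host portion of a URL.
	fmt.Println(htmlutil.URLDomain("https://www.youtube.com/embed/x")) // -> www.youtube.com
}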

View File

@@ -2,7 +2,7 @@
 // Use of this source code is governed by the Apache 2.0
 // license that can be found in the LICENSE file.
 
-package reader
+package readability
 
 import (
 	"bytes"
@@ -13,7 +13,7 @@ import (
 	"regexp"
 	"strings"
 
-	"github.com/nkanaev/yarr/src/htmlutil"
+	"github.com/nkanaev/yarr/src/content/htmlutil"
 	"golang.org/x/net/html"
 )

View File

@@ -12,6 +12,7 @@ import (
 	"strconv"
 	"strings"
 
+	"github.com/nkanaev/yarr/src/content/htmlutil"
 	"golang.org/x/net/html"
 )
@@ -116,7 +117,7 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
 		} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
 			value = attribute.Val
 		} else {
-			value = absoluteUrl(value, baseURL)
+			value = htmlutil.AbsoluteUrl(value, baseURL)
 			if value == "" {
 				continue
 			}
@@ -294,9 +295,9 @@ func isValidIframeSource(baseURL, src string) bool {
 		"www.youtube.com",
 	}
 
-	domain := urlDomain(src)
+	domain := htmlutil.URLDomain(src)
 
 	// allow iframe from same origin
-	if urlDomain(baseURL) == domain {
+	if htmlutil.URLDomain(baseURL) == domain {
 		return true
 	}
@@ -416,7 +417,7 @@ func sanitizeSrcsetAttr(baseURL, value string) string {
 		if nbParts > 0 {
 			sanitizedSource := parts[0]
 			if !strings.HasPrefix(parts[0], "data:") {
-				sanitizedSource = absoluteUrl(parts[0], baseURL)
+				sanitizedSource = htmlutil.AbsoluteUrl(parts[0], baseURL)
 				if sanitizedSource == "" {
 					continue
 				}
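
Both sanitizer call sites rely on a property visible in the htmlutil diff above: AbsoluteUrl returns the empty string when a URL fails to parse, and the sanitizer responds to "" with continue, silently dropping the attribute. A small illustration, using hypothetical URLs:

package main

import (
	"fmt"

	"github.com/nkanaev/yarr/src/content/htmlutil"
)

func main() {
	// A space in the host makes url.Parse fail, so AbsoluteUrl returns "";
	// the sanitizer would drop this attribute.
	fmt.Printf("%q\n", htmlutil.AbsoluteUrl("/img.png", "http://bad host/")) // ""

	// Normal case: a relative src resolved against the page URL.
	fmt.Println(htmlutil.AbsoluteUrl("img.png", "https://example.com/a/"))
	// -> https://example.com/a/img.png
}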

View File

@@ -3,7 +3,7 @@ package scraper
 import (
 	"strings"
 
-	"github.com/nkanaev/yarr/src/htmlutil"
+	"github.com/nkanaev/yarr/src/content/htmlutil"
 	"golang.org/x/net/html"
 )
@@ -32,7 +32,7 @@ func FindFeeds(body string, base string) map[string]string {
 	for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
 		href := htmlutil.Attr(node, "href")
 		name := htmlutil.Attr(node, "title")
-		link := absoluteUrl(href, base)
+		link := htmlutil.AbsoluteUrl(href, base)
 		if link != "" {
 			candidates[link] = name
 		}
@@ -63,7 +63,7 @@ func FindFeeds(body string, base string) map[string]string {
 	}
 	for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
 		href := htmlutil.Attr(node, "href")
-		link := absoluteUrl(href, base)
+		link := htmlutil.AbsoluteUrl(href, base)
 		if link != "" {
 			candidates[link] = ""
 		}
@@ -89,7 +89,7 @@ func FindIcons(body string, base string) []string {
 		rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
 		for _, rel := range rels {
 			if strings.EqualFold(rel, "icon") {
-				icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base))
+				icons = append(icons, htmlutil.AbsoluteUrl(htmlutil.Attr(node, "href"), base))
 			}
 		}
 	}
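
FindFeeds and FindIcons keep their signatures (FindFeeds(body, base string) map[string]string, FindIcons(body, base string) []string); only the URL resolution moves to the shared package. A hedged usage sketch, assuming the usual <link rel="alternate"> feed detection (the matching predicates isFeedLink and isFeedHyperLink are not shown in this diff):

package main

import (
	"fmt"

	"github.com/nkanaev/yarr/src/content/scraper"
)

func main() {
	page := `<html><head>
		<link rel="alternate" type="application/atom+xml" href="/feed.xml" title="Posts">
	</head></html>`

	// Keys are absolute feed URLs (via htmlutil.AbsoluteUrl), values are link titles.
	for link, name := range scraper.FindFeeds(page, "https://example.com/") {
		fmt.Println(link, name)
	}
}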

View File

@@ -9,10 +9,10 @@ import (
 	"github.com/nkanaev/yarr/src/assets"
 	"github.com/nkanaev/yarr/src/auth"
+	"github.com/nkanaev/yarr/src/content/readability"
+	"github.com/nkanaev/yarr/src/content/sanitizer"
 	"github.com/nkanaev/yarr/src/opml"
-	"github.com/nkanaev/yarr/src/reader"
 	"github.com/nkanaev/yarr/src/router"
-	"github.com/nkanaev/yarr/src/scraper"
 	"github.com/nkanaev/yarr/src/storage"
 	"github.com/nkanaev/yarr/src/worker"
 )
@@ -419,7 +419,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
 		return
 	}
 	defer res.Body.Close()
-	content, err := reader.ExtractContent(res.Body)
+	content, err := readability.ExtractContent(res.Body)
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusNoContent)
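
The handler now calls the extractor under its new package name. A sketch of the same flow in isolation; it assumes, as the handler's usage suggests, that readability.ExtractContent takes the response body as an io.Reader and returns the extracted article markup as a string:

package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/nkanaev/yarr/src/content/readability"
)

func main() {
	res, err := http.Get("https://example.com/article")
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()

	// Extract the readable article content from the fetched page.
	content, err := readability.ExtractContent(res.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(content)
}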

View File

@@ -8,7 +8,7 @@ import (
 	"net/http"
 	"net/url"
 
-	"github.com/nkanaev/yarr/src/scraper"
+	"github.com/nkanaev/yarr/src/content/scraper"
 	"github.com/nkanaev/yarr/src/parser"
 	"github.com/nkanaev/yarr/src/storage"
 	"golang.org/x/net/html/charset"