diff --git a/src/htmlutil/query.go b/src/content/htmlutil/query.go similarity index 100% rename from src/htmlutil/query.go rename to src/content/htmlutil/query.go diff --git a/src/htmlutil/query_test.go b/src/content/htmlutil/query_test.go similarity index 100% rename from src/htmlutil/query_test.go rename to src/content/htmlutil/query_test.go diff --git a/src/scraper/utils.go b/src/content/htmlutil/urlutils.go similarity index 70% rename from src/scraper/utils.go rename to src/content/htmlutil/urlutils.go index 6db70f8..f5c02ac 100644 --- a/src/scraper/utils.go +++ b/src/content/htmlutil/urlutils.go @@ -1,10 +1,10 @@ -package scraper +package htmlutil import ( "net/url" ) -func any(els []string, el string, match func(string, string) bool) bool { +func Any(els []string, el string, match func(string, string) bool) bool { for _, x := range els { if match(x, el) { return true @@ -13,7 +13,7 @@ func any(els []string, el string, match func(string, string) bool) bool { return false } -func absoluteUrl(href, base string) string { +func AbsoluteUrl(href, base string) string { baseUrl, err := url.Parse(base) if err != nil { return "" @@ -25,7 +25,7 @@ func absoluteUrl(href, base string) string { return baseUrl.ResolveReference(hrefUrl).String() } -func urlDomain(val string) string { +func URLDomain(val string) string { if u, err := url.Parse(val); err == nil { return u.Host } diff --git a/src/htmlutil/utils.go b/src/content/htmlutil/utils.go similarity index 100% rename from src/htmlutil/utils.go rename to src/content/htmlutil/utils.go diff --git a/src/reader/LICENSE b/src/content/readability/LICENSE similarity index 100% rename from src/reader/LICENSE rename to src/content/readability/LICENSE diff --git a/src/reader/readability.go b/src/content/readability/readability.go similarity index 99% rename from src/reader/readability.go rename to src/content/readability/readability.go index 992a5e7..37b7304 100644 --- a/src/reader/readability.go +++ b/src/content/readability/readability.go @@ -2,7 +2,7 @@ // Use of this source code is governed by the Apache 2.0 // license that can be found in the LICENSE file. -package reader +package readability import ( "bytes" @@ -13,7 +13,7 @@ import ( "regexp" "strings" - "github.com/nkanaev/yarr/src/htmlutil" + "github.com/nkanaev/yarr/src/content/htmlutil" "golang.org/x/net/html" ) diff --git a/src/scraper/sanitizer.go b/src/content/sanitizer/sanitizer.go similarity index 97% rename from src/scraper/sanitizer.go rename to src/content/sanitizer/sanitizer.go index 1f81009..32b3ce1 100644 --- a/src/scraper/sanitizer.go +++ b/src/content/sanitizer/sanitizer.go @@ -12,6 +12,7 @@ import ( "strconv" "strings" + "github.com/nkanaev/yarr/src/content/htmlutil" "golang.org/x/net/html" ) @@ -116,7 +117,7 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([ } else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) { value = attribute.Val } else { - value = absoluteUrl(value, baseURL) + value = htmlutil.AbsoluteUrl(value, baseURL) if value == "" { continue } @@ -294,9 +295,9 @@ func isValidIframeSource(baseURL, src string) bool { "www.youtube.com", } - domain := urlDomain(src) + domain := htmlutil.URLDomain(src) // allow iframe from same origin - if urlDomain(baseURL) == domain { + if htmlutil.URLDomain(baseURL) == domain { return true } @@ -416,7 +417,7 @@ func sanitizeSrcsetAttr(baseURL, value string) string { if nbParts > 0 { sanitizedSource := parts[0] if !strings.HasPrefix(parts[0], "data:") { - sanitizedSource = absoluteUrl(parts[0], baseURL) + sanitizedSource = htmlutil.AbsoluteUrl(parts[0], baseURL) if sanitizedSource == "" { continue } diff --git a/src/scraper/sanitizer_test.go b/src/content/sanitizer/sanitizer_test.go similarity index 100% rename from src/scraper/sanitizer_test.go rename to src/content/sanitizer/sanitizer_test.go diff --git a/src/scraper/finder.go b/src/content/scraper/finder.go similarity index 90% rename from src/scraper/finder.go rename to src/content/scraper/finder.go index 96faf58..fa1a9af 100644 --- a/src/scraper/finder.go +++ b/src/content/scraper/finder.go @@ -3,7 +3,7 @@ package scraper import ( "strings" - "github.com/nkanaev/yarr/src/htmlutil" + "github.com/nkanaev/yarr/src/content/htmlutil" "golang.org/x/net/html" ) @@ -32,7 +32,7 @@ func FindFeeds(body string, base string) map[string]string { for _, node := range htmlutil.FindNodes(doc, isFeedLink) { href := htmlutil.Attr(node, "href") name := htmlutil.Attr(node, "title") - link := absoluteUrl(href, base) + link := htmlutil.AbsoluteUrl(href, base) if link != "" { candidates[link] = name } @@ -63,7 +63,7 @@ func FindFeeds(body string, base string) map[string]string { } for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) { href := htmlutil.Attr(node, "href") - link := absoluteUrl(href, base) + link := htmlutil.AbsoluteUrl(href, base) if link != "" { candidates[link] = "" } @@ -89,7 +89,7 @@ func FindIcons(body string, base string) []string { rels := strings.Split(htmlutil.Attr(node, "rel"), " ") for _, rel := range rels { if strings.EqualFold(rel, "icon") { - icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base)) + icons = append(icons, htmlutil.AbsoluteUrl(htmlutil.Attr(node, "href"), base)) } } } diff --git a/src/scraper/finder_test.go b/src/content/scraper/finder_test.go similarity index 100% rename from src/scraper/finder_test.go rename to src/content/scraper/finder_test.go diff --git a/src/server/routes.go b/src/server/routes.go index 2f9034b..0d236d3 100644 --- a/src/server/routes.go +++ b/src/server/routes.go @@ -9,10 +9,10 @@ import ( "github.com/nkanaev/yarr/src/assets" "github.com/nkanaev/yarr/src/auth" + "github.com/nkanaev/yarr/src/content/readability" + "github.com/nkanaev/yarr/src/content/sanitizer" "github.com/nkanaev/yarr/src/opml" - "github.com/nkanaev/yarr/src/reader" "github.com/nkanaev/yarr/src/router" - "github.com/nkanaev/yarr/src/scraper" "github.com/nkanaev/yarr/src/storage" "github.com/nkanaev/yarr/src/worker" ) @@ -419,7 +419,7 @@ func (s *Server) handlePageCrawl(c *router.Context) { return } defer res.Body.Close() - content, err := reader.ExtractContent(res.Body) + content, err := readability.ExtractContent(res.Body) if err != nil { log.Print(err) c.Out.WriteHeader(http.StatusNoContent) diff --git a/src/worker/crawler.go b/src/worker/crawler.go index b05a2c6..508d3cd 100644 --- a/src/worker/crawler.go +++ b/src/worker/crawler.go @@ -8,7 +8,7 @@ import ( "net/http" "net/url" - "github.com/nkanaev/yarr/src/scraper" + "github.com/nkanaev/yarr/src/content/scraper" "github.com/nkanaev/yarr/src/parser" "github.com/nkanaev/yarr/src/storage" "golang.org/x/net/html/charset"