mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
reorganizing content-related packages
This commit is contained in:
parent
0b8bf50204
commit
b04e8c1e93
@ -1,10 +1,10 @@
|
||||
package scraper
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
)
|
||||
|
||||
func any(els []string, el string, match func(string, string) bool) bool {
|
||||
func Any(els []string, el string, match func(string, string) bool) bool {
|
||||
for _, x := range els {
|
||||
if match(x, el) {
|
||||
return true
|
||||
@ -13,7 +13,7 @@ func any(els []string, el string, match func(string, string) bool) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func absoluteUrl(href, base string) string {
|
||||
func AbsoluteUrl(href, base string) string {
|
||||
baseUrl, err := url.Parse(base)
|
||||
if err != nil {
|
||||
return ""
|
||||
@ -25,7 +25,7 @@ func absoluteUrl(href, base string) string {
|
||||
return baseUrl.ResolveReference(hrefUrl).String()
|
||||
}
|
||||
|
||||
func urlDomain(val string) string {
|
||||
func URLDomain(val string) string {
|
||||
if u, err := url.Parse(val); err == nil {
|
||||
return u.Host
|
||||
}
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package reader
|
||||
package readability
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
@ -13,7 +13,7 @@ import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/nkanaev/yarr/src/htmlutil"
|
||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
@ -12,6 +12,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
@ -116,7 +117,7 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
|
||||
} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
|
||||
value = attribute.Val
|
||||
} else {
|
||||
value = absoluteUrl(value, baseURL)
|
||||
value = htmlutil.AbsoluteUrl(value, baseURL)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
@ -294,9 +295,9 @@ func isValidIframeSource(baseURL, src string) bool {
|
||||
"www.youtube.com",
|
||||
}
|
||||
|
||||
domain := urlDomain(src)
|
||||
domain := htmlutil.URLDomain(src)
|
||||
// allow iframe from same origin
|
||||
if urlDomain(baseURL) == domain {
|
||||
if htmlutil.URLDomain(baseURL) == domain {
|
||||
return true
|
||||
}
|
||||
|
||||
@ -416,7 +417,7 @@ func sanitizeSrcsetAttr(baseURL, value string) string {
|
||||
if nbParts > 0 {
|
||||
sanitizedSource := parts[0]
|
||||
if !strings.HasPrefix(parts[0], "data:") {
|
||||
sanitizedSource = absoluteUrl(parts[0], baseURL)
|
||||
sanitizedSource = htmlutil.AbsoluteUrl(parts[0], baseURL)
|
||||
if sanitizedSource == "" {
|
||||
continue
|
||||
}
|
@ -3,7 +3,7 @@ package scraper
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/nkanaev/yarr/src/htmlutil"
|
||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
@ -32,7 +32,7 @@ func FindFeeds(body string, base string) map[string]string {
|
||||
for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
|
||||
href := htmlutil.Attr(node, "href")
|
||||
name := htmlutil.Attr(node, "title")
|
||||
link := absoluteUrl(href, base)
|
||||
link := htmlutil.AbsoluteUrl(href, base)
|
||||
if link != "" {
|
||||
candidates[link] = name
|
||||
}
|
||||
@ -63,7 +63,7 @@ func FindFeeds(body string, base string) map[string]string {
|
||||
}
|
||||
for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
|
||||
href := htmlutil.Attr(node, "href")
|
||||
link := absoluteUrl(href, base)
|
||||
link := htmlutil.AbsoluteUrl(href, base)
|
||||
if link != "" {
|
||||
candidates[link] = ""
|
||||
}
|
||||
@ -89,7 +89,7 @@ func FindIcons(body string, base string) []string {
|
||||
rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
|
||||
for _, rel := range rels {
|
||||
if strings.EqualFold(rel, "icon") {
|
||||
icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base))
|
||||
icons = append(icons, htmlutil.AbsoluteUrl(htmlutil.Attr(node, "href"), base))
|
||||
}
|
||||
}
|
||||
}
|
@ -9,10 +9,10 @@ import (
|
||||
|
||||
"github.com/nkanaev/yarr/src/assets"
|
||||
"github.com/nkanaev/yarr/src/auth"
|
||||
"github.com/nkanaev/yarr/src/content/readability"
|
||||
"github.com/nkanaev/yarr/src/content/sanitizer"
|
||||
"github.com/nkanaev/yarr/src/opml"
|
||||
"github.com/nkanaev/yarr/src/reader"
|
||||
"github.com/nkanaev/yarr/src/router"
|
||||
"github.com/nkanaev/yarr/src/scraper"
|
||||
"github.com/nkanaev/yarr/src/storage"
|
||||
"github.com/nkanaev/yarr/src/worker"
|
||||
)
|
||||
@ -419,7 +419,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
|
||||
return
|
||||
}
|
||||
defer res.Body.Close()
|
||||
content, err := reader.ExtractContent(res.Body)
|
||||
content, err := readability.ExtractContent(res.Body)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
c.Out.WriteHeader(http.StatusNoContent)
|
||||
|
@ -8,7 +8,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
|
||||
"github.com/nkanaev/yarr/src/scraper"
|
||||
"github.com/nkanaev/yarr/src/content/scraper"
|
||||
"github.com/nkanaev/yarr/src/parser"
|
||||
"github.com/nkanaev/yarr/src/storage"
|
||||
"golang.org/x/net/html/charset"
|
||||
|
Loading…
x
Reference in New Issue
Block a user