mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-25 13:39:22 +00:00
reorganizing content-related packages
This commit is contained in:
parent
0b8bf50204
commit
b04e8c1e93
@ -1,10 +1,10 @@
|
|||||||
package scraper
|
package htmlutil
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"net/url"
|
"net/url"
|
||||||
)
|
)
|
||||||
|
|
||||||
func any(els []string, el string, match func(string, string) bool) bool {
|
func Any(els []string, el string, match func(string, string) bool) bool {
|
||||||
for _, x := range els {
|
for _, x := range els {
|
||||||
if match(x, el) {
|
if match(x, el) {
|
||||||
return true
|
return true
|
||||||
@ -13,7 +13,7 @@ func any(els []string, el string, match func(string, string) bool) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func absoluteUrl(href, base string) string {
|
func AbsoluteUrl(href, base string) string {
|
||||||
baseUrl, err := url.Parse(base)
|
baseUrl, err := url.Parse(base)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ""
|
return ""
|
||||||
@ -25,7 +25,7 @@ func absoluteUrl(href, base string) string {
|
|||||||
return baseUrl.ResolveReference(hrefUrl).String()
|
return baseUrl.ResolveReference(hrefUrl).String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func urlDomain(val string) string {
|
func URLDomain(val string) string {
|
||||||
if u, err := url.Parse(val); err == nil {
|
if u, err := url.Parse(val); err == nil {
|
||||||
return u.Host
|
return u.Host
|
||||||
}
|
}
|
@ -2,7 +2,7 @@
|
|||||||
// Use of this source code is governed by the Apache 2.0
|
// Use of this source code is governed by the Apache 2.0
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
package reader
|
package readability
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
@ -13,7 +13,7 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/nkanaev/yarr/src/htmlutil"
|
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
@ -12,6 +12,7 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -116,7 +117,7 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
|
|||||||
} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
|
} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
|
||||||
value = attribute.Val
|
value = attribute.Val
|
||||||
} else {
|
} else {
|
||||||
value = absoluteUrl(value, baseURL)
|
value = htmlutil.AbsoluteUrl(value, baseURL)
|
||||||
if value == "" {
|
if value == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -294,9 +295,9 @@ func isValidIframeSource(baseURL, src string) bool {
|
|||||||
"www.youtube.com",
|
"www.youtube.com",
|
||||||
}
|
}
|
||||||
|
|
||||||
domain := urlDomain(src)
|
domain := htmlutil.URLDomain(src)
|
||||||
// allow iframe from same origin
|
// allow iframe from same origin
|
||||||
if urlDomain(baseURL) == domain {
|
if htmlutil.URLDomain(baseURL) == domain {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -416,7 +417,7 @@ func sanitizeSrcsetAttr(baseURL, value string) string {
|
|||||||
if nbParts > 0 {
|
if nbParts > 0 {
|
||||||
sanitizedSource := parts[0]
|
sanitizedSource := parts[0]
|
||||||
if !strings.HasPrefix(parts[0], "data:") {
|
if !strings.HasPrefix(parts[0], "data:") {
|
||||||
sanitizedSource = absoluteUrl(parts[0], baseURL)
|
sanitizedSource = htmlutil.AbsoluteUrl(parts[0], baseURL)
|
||||||
if sanitizedSource == "" {
|
if sanitizedSource == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
@ -3,7 +3,7 @@ package scraper
|
|||||||
import (
|
import (
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/nkanaev/yarr/src/htmlutil"
|
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -32,7 +32,7 @@ func FindFeeds(body string, base string) map[string]string {
|
|||||||
for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
|
for _, node := range htmlutil.FindNodes(doc, isFeedLink) {
|
||||||
href := htmlutil.Attr(node, "href")
|
href := htmlutil.Attr(node, "href")
|
||||||
name := htmlutil.Attr(node, "title")
|
name := htmlutil.Attr(node, "title")
|
||||||
link := absoluteUrl(href, base)
|
link := htmlutil.AbsoluteUrl(href, base)
|
||||||
if link != "" {
|
if link != "" {
|
||||||
candidates[link] = name
|
candidates[link] = name
|
||||||
}
|
}
|
||||||
@ -63,7 +63,7 @@ func FindFeeds(body string, base string) map[string]string {
|
|||||||
}
|
}
|
||||||
for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
|
for _, node := range htmlutil.FindNodes(doc, isFeedHyperLink) {
|
||||||
href := htmlutil.Attr(node, "href")
|
href := htmlutil.Attr(node, "href")
|
||||||
link := absoluteUrl(href, base)
|
link := htmlutil.AbsoluteUrl(href, base)
|
||||||
if link != "" {
|
if link != "" {
|
||||||
candidates[link] = ""
|
candidates[link] = ""
|
||||||
}
|
}
|
||||||
@ -89,7 +89,7 @@ func FindIcons(body string, base string) []string {
|
|||||||
rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
|
rels := strings.Split(htmlutil.Attr(node, "rel"), " ")
|
||||||
for _, rel := range rels {
|
for _, rel := range rels {
|
||||||
if strings.EqualFold(rel, "icon") {
|
if strings.EqualFold(rel, "icon") {
|
||||||
icons = append(icons, absoluteUrl(htmlutil.Attr(node, "href"), base))
|
icons = append(icons, htmlutil.AbsoluteUrl(htmlutil.Attr(node, "href"), base))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -9,10 +9,10 @@ import (
|
|||||||
|
|
||||||
"github.com/nkanaev/yarr/src/assets"
|
"github.com/nkanaev/yarr/src/assets"
|
||||||
"github.com/nkanaev/yarr/src/auth"
|
"github.com/nkanaev/yarr/src/auth"
|
||||||
|
"github.com/nkanaev/yarr/src/content/readability"
|
||||||
|
"github.com/nkanaev/yarr/src/content/sanitizer"
|
||||||
"github.com/nkanaev/yarr/src/opml"
|
"github.com/nkanaev/yarr/src/opml"
|
||||||
"github.com/nkanaev/yarr/src/reader"
|
|
||||||
"github.com/nkanaev/yarr/src/router"
|
"github.com/nkanaev/yarr/src/router"
|
||||||
"github.com/nkanaev/yarr/src/scraper"
|
|
||||||
"github.com/nkanaev/yarr/src/storage"
|
"github.com/nkanaev/yarr/src/storage"
|
||||||
"github.com/nkanaev/yarr/src/worker"
|
"github.com/nkanaev/yarr/src/worker"
|
||||||
)
|
)
|
||||||
@ -419,7 +419,7 @@ func (s *Server) handlePageCrawl(c *router.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
defer res.Body.Close()
|
||||||
content, err := reader.ExtractContent(res.Body)
|
content, err := readability.ExtractContent(res.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
c.Out.WriteHeader(http.StatusNoContent)
|
c.Out.WriteHeader(http.StatusNoContent)
|
||||||
|
@ -8,7 +8,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
|
||||||
"github.com/nkanaev/yarr/src/scraper"
|
"github.com/nkanaev/yarr/src/content/scraper"
|
||||||
"github.com/nkanaev/yarr/src/parser"
|
"github.com/nkanaev/yarr/src/parser"
|
||||||
"github.com/nkanaev/yarr/src/storage"
|
"github.com/nkanaev/yarr/src/storage"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user