diff --git a/go.mod b/go.mod index cbc46b2..c55fecf 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,6 @@ module github.com/nkanaev/yarr go 1.16 require ( - github.com/PuerkitoBio/goquery v1.5.1 github.com/mattn/go-sqlite3 v1.14.0 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e golang.org/x/sys v0.0.0-20201018230417-eeed37f84f13 diff --git a/go.sum b/go.sum index 02859e5..054e8f1 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,4 @@ -github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= -github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA= github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus= diff --git a/src/reader/readability.go b/src/reader/readability.go index 8442f49..992a5e7 100644 --- a/src/reader/readability.go +++ b/src/reader/readability.go @@ -14,7 +14,6 @@ import ( "strings" "github.com/nkanaev/yarr/src/htmlutil" - "github.com/PuerkitoBio/goquery" "golang.org/x/net/html" ) @@ -34,51 +33,15 @@ var ( positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`) ) -type candidate struct { - selection *goquery.Selection - score float32 -} - -func (c *candidate) Node() *html.Node { - return c.selection.Get(0) -} - -type scorelist map[*html.Node]float32 - -func (c *candidate) String() string { - id, _ := c.selection.Attr("id") - class, _ := c.selection.Attr("class") - - if id != "" && class != "" { - return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score) - } else if id != "" { - return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score) - } else if class != "" { - return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score) - } - - return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score) -} - -type candidateList map[*html.Node]*candidate - -func (c candidateList) String() string { - var output []string - for _, candidate := range c { - output = append(output, candidate.String()) - } - - return strings.Join(output, ", ") -} +type nodeScores map[*html.Node]float32 // ExtractContent returns relevant content. func ExtractContent(page io.Reader) (string, error) { - document, err := goquery.NewDocumentFromReader(page) + root, err := html.Parse(page) if err != nil { return "", err } - root := document.Get(0) for _, trash := range htmlutil.Query(root, "script,style") { if trash.Parent != nil { trash.Parent.RemoveChild(trash) @@ -97,50 +60,56 @@ func ExtractContent(page io.Reader) (string, error) { } //log.Printf("[Readability] TopCandidate: %v", topCandidate) - output := getArticle(root, best, scores) + output := getArticle(best, scores) return output, nil } // Now that we have the top candidate, look through its siblings for content that might also be related. // Things like preambles, content split by ads that we removed, etc. -func getArticle(root, best *html.Node, scores scorelist) string { - selection := goquery.NewDocumentFromNode(root).FindNodes(best) - topCandidate := &candidate{selection: selection, score: scores[best]} +func getArticle(best *html.Node, scores nodeScores) string { output := bytes.NewBufferString("