diff --git a/src/htmlutil/query.go b/src/htmlutil/query.go index 3939c3c..bf17644 100644 --- a/src/htmlutil/query.go +++ b/src/htmlutil/query.go @@ -7,7 +7,7 @@ import ( "golang.org/x/net/html" ) -var nodeNameRegex = regexp.MustCompile(`\w+`) +var nodeNameRegex = regexp.MustCompile(`\w+|\*`) func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node { nodes := make([]*html.Node, 0) @@ -40,7 +40,7 @@ func NewMatcher(sel string) Matcher { if nodeNameRegex.MatchString(part) { multi.Add(ElementMatch{Name: part}) } else { - panic("unsupported selector") + panic("unsupported selector: " + part) } } return multi @@ -55,7 +55,7 @@ type ElementMatch struct { } func (m ElementMatch) Match(n *html.Node) bool { - return n.Type == html.ElementNode && n.Data == m.Name + return n.Type == html.ElementNode && (n.Data == m.Name || m.Name == "*") } type MultiMatch struct { diff --git a/src/htmlutil/utils.go b/src/htmlutil/utils.go index 5ef1b02..fab1b25 100644 --- a/src/htmlutil/utils.go +++ b/src/htmlutil/utils.go @@ -12,6 +12,14 @@ func HTML(node *html.Node) string { return writer.String() } +func InnerHTML(node *html.Node) string { + writer := strings.Builder{} + for c := node.FirstChild; c != nil; c = c.NextSibling { + html.Render(&writer, c) + } + return writer.String() +} + func Attr(node *html.Node, key string) string { for _, a := range node.Attr { if a.Key == key { diff --git a/src/reader/readability.go b/src/reader/readability.go index aed2b24..174e65c 100644 --- a/src/reader/readability.go +++ b/src/reader/readability.go @@ -83,8 +83,8 @@ func ExtractContent(page io.Reader) (string, error) { } } - transformMisusedDivsIntoParagraphs(document) - removeUnlikelyCandidates(document) + transformMisusedDivsIntoParagraphs(root) + removeUnlikelyCandidates(root) candidates := getCandidates(document) //log.Printf("[Readability] Candidates: %v", candidates) @@ -139,19 +139,22 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { return output.String() } -func removeUnlikelyCandidates(document *goquery.Document) { - document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) { - class, _ := s.Attr("class") - id, _ := s.Attr("id") - str := class + id +func removeUnlikelyCandidates(root *html.Node) { + body := htmlutil.Query(root, "body") + if len(body) == 0 { + return + } + for _, node := range htmlutil.Query(body[0], "*") { + str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id") - if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) { - node := s.Get(0) - if node.Parent != nil { - node.Parent.RemoveChild(node) - } + blacklisted := ( + blacklistCandidatesRegexp.MatchString(str) || + (unlikelyCandidatesRegexp.MatchString(str) && + !okMaybeItsACandidateRegexp.MatchString(str))) + if blacklisted && node.Parent != nil { + node.Parent.RemoveChild(node) } - }) + } } func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate { @@ -292,12 +295,10 @@ func getClassWeight(s *goquery.Selection) float32 { return float32(weight) } -func transformMisusedDivsIntoParagraphs(document *goquery.Document) { - document.Find("div").Each(func(i int, s *goquery.Selection) { - html, _ := s.Html() - if !divToPElementsRegexp.MatchString(html) { - node := s.Get(0) +func transformMisusedDivsIntoParagraphs(root *html.Node) { + for _, node := range htmlutil.Query(root, "div") { + if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) { node.Data = "p" } - }) + } }