diff --git a/src/htmlutil/query.go b/src/htmlutil/query.go
index 3939c3c..bf17644 100644
--- a/src/htmlutil/query.go
+++ b/src/htmlutil/query.go
@@ -7,7 +7,7 @@ import (
"golang.org/x/net/html"
)
-var nodeNameRegex = regexp.MustCompile(`\w+`)
+var nodeNameRegex = regexp.MustCompile(`\w+|\*`) // selector-part pattern: a run of word characters or a literal "*"; NOTE(review): unanchored, so MatchString succeeds for any string that merely contains such a run — confirm whether ^(?:\w+|\*)$ was intended
func FindNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
nodes := make([]*html.Node, 0)
@@ -40,7 +40,7 @@ func NewMatcher(sel string) Matcher {
if nodeNameRegex.MatchString(part) {
multi.Add(ElementMatch{Name: part})
} else {
- panic("unsupported selector")
+ panic("unsupported selector: " + part)
}
}
return multi
@@ -55,7 +55,7 @@ type ElementMatch struct {
}
func (m ElementMatch) Match(n *html.Node) bool {
- return n.Type == html.ElementNode && n.Data == m.Name
+ return n.Type == html.ElementNode && (n.Data == m.Name || m.Name == "*") // element nodes only; a Name of "*" acts as a wildcard accepting any tag, otherwise n.Data must equal Name exactly
}
type MultiMatch struct {
diff --git a/src/htmlutil/utils.go b/src/htmlutil/utils.go
index 5ef1b02..fab1b25 100644
--- a/src/htmlutil/utils.go
+++ b/src/htmlutil/utils.go
@@ -12,6 +12,14 @@ func HTML(node *html.Node) string {
return writer.String()
}
+func InnerHTML(node *html.Node) string { // InnerHTML returns the rendered markup of node's children, concatenated in sibling order; node's own tag is not rendered.
+ writer := strings.Builder{}
+ for c := node.FirstChild; c != nil; c = c.NextSibling { // walk direct children only; html.Render serializes each child's subtree
+ html.Render(&writer, c) // NOTE(review): the error returned by html.Render is discarded here
+ }
+ return writer.String() // empty string when node has no children
+}
+
func Attr(node *html.Node, key string) string {
for _, a := range node.Attr {
if a.Key == key {
diff --git a/src/reader/readability.go b/src/reader/readability.go
index aed2b24..174e65c 100644
--- a/src/reader/readability.go
+++ b/src/reader/readability.go
@@ -83,8 +83,8 @@ func ExtractContent(page io.Reader) (string, error) {
}
}
- transformMisusedDivsIntoParagraphs(document)
- removeUnlikelyCandidates(document)
+ transformMisusedDivsIntoParagraphs(root)
+ removeUnlikelyCandidates(root)
candidates := getCandidates(document)
//log.Printf("[Readability] Candidates: %v", candidates)
@@ -139,19 +139,22 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
return output.String()
}
-func removeUnlikelyCandidates(document *goquery.Document) {
- document.Find("*").Not("html,body").Each(func(i int, s *goquery.Selection) {
- class, _ := s.Attr("class")
- id, _ := s.Attr("id")
- str := class + id
+func removeUnlikelyCandidates(root *html.Node) { // removeUnlikelyCandidates detaches from the tree every node under <body> whose class+id text is blacklisted or looks unlikely to hold article content.
+ body := htmlutil.Query(root, "body") // presumably returns the nodes under root matching the selector — verify against htmlutil.Query
+ if len(body) == 0 {
+ return // no <body> found: nothing to prune
+ }
+ for _, node := range htmlutil.Query(body[0], "*") { // the match list is built once before the loop, so removals below do not disturb iteration
+ str := htmlutil.Attr(node, "class") + htmlutil.Attr(node, "id") // class and id are concatenated with no separator and tested as one string
- if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
- node := s.Get(0)
- if node.Parent != nil {
- node.Parent.RemoveChild(node)
- }
+ blacklisted := (
+ blacklistCandidatesRegexp.MatchString(str) ||
+ (unlikelyCandidatesRegexp.MatchString(str) &&
+ !okMaybeItsACandidateRegexp.MatchString(str)))
+ if blacklisted && node.Parent != nil { // a node with a nil Parent is skipped rather than removed
+ node.Parent.RemoveChild(node)
}
- })
+ }
}
func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {
@@ -292,12 +295,10 @@ func getClassWeight(s *goquery.Selection) float32 {
return float32(weight)
}
-func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
- document.Find("div").Each(func(i int, s *goquery.Selection) {
- html, _ := s.Html()
- if !divToPElementsRegexp.MatchString(html) {
- node := s.Get(0)
+func transformMisusedDivsIntoParagraphs(root *html.Node) { // transformMisusedDivsIntoParagraphs renames to "p" every div under root whose inner HTML does not match divToPElementsRegexp.
+ for _, node := range htmlutil.Query(root, "div") { // presumably every <div> found under root — verify against htmlutil.Query
+ if !divToPElementsRegexp.MatchString(htmlutil.InnerHTML(node)) { // the test runs on the serialized children, not on the div's own tag
node.Data = "p"
}
- })
+ }
}