handle html type atom text

2025-12-14 08:07:20 +00:00 · 2021-04-02 21:46:23 +01:00
parent 8967936fb6
commit e50c7e1a51
5 changed files with 45 additions and 26 deletions
--- a/src/content/htmlutil/utils.go
+++ b/src/content/htmlutil/utils.go
@@ -1,6 +1,7 @@
 package htmlutil

 import (
+	"bytes"
 	"strings"

 	"golang.org/x/net/html"
@@ -39,3 +40,18 @@ func Text(node *html.Node) string {
 	}
 	return strings.Join(text, " ")
 }
+
+// ExtractText returns the concatenated text nodes of an HTML fragment.
+// Tokenizer.Text already decodes entities, so no further unescaping is done.
+func ExtractText(content string) string {
+	tokenizer := html.NewTokenizer(strings.NewReader(content))
+	var buffer bytes.Buffer
+	for {
+		switch tokenizer.Next() {
+		case html.ErrorToken:
+			return buffer.String()
+		case html.TextToken:
+			buffer.Write(tokenizer.Text())
+		}
+	}
+}
--- a/src/parser/atom.go
+++ b/src/parser/atom.go
@@ -6,6 +6,8 @@ import (
 	"html"
 	"io"
 	"strings"
+
+	"github.com/nkanaev/yarr/src/content/htmlutil"
 )

 type atomFeed struct {
@@ -42,6 +44,13 @@ type atomLink struct {

 type atomLinks []atomLink

+func (a *atomText) Text() string {
+	if a.Type != "html" { // every other type is returned exactly as stored
+		return a.Data
+	}
+	return htmlutil.ExtractText(a.Data)
+}
+
 func (a *atomText) String() string {
 	data := a.Data
 	if a.Type == "xhtml" {
@@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) {
 			GUID:     firstNonEmpty(srcitem.ID),
 			Date:     dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
 			URL:      firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
-			Title:    srcitem.Title.String(),
+			Title:    srcitem.Title.Text(),
 			Content:  firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
 			ImageURL: srcitem.firstMediaThumbnail(),
 			AudioURL: "",
--- a/src/parser/atom_test.go
+++ b/src/parser/atom_test.go
@@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) {
 		t.FailNow()
 	}
 }
+
+// TestAtomHTMLTitle checks that markup in a type="html" title is reduced to text.
+func TestAtomHTMLTitle(t *testing.T) {
+	feed, err := Parse(strings.NewReader(`
+		<?xml version="1.0" encoding="utf-8"?>
+		<feed xmlns="http://www.w3.org/2005/Atom">
+			<entry><title type="html">say &lt;code&gt;what&lt;/code&gt;?</title></entry>
+		</feed>
+	`))
+	if err != nil || len(feed.Items) == 0 {
+		t.Fatalf("parse failed or produced no items: %v", err)
+	}
+	if have, want := feed.Items[0].Title, "say what?"; have != want {
+		t.Fatalf("want: %#v, have: %#v", want, have)
+	}
+}
--- a/src/storage/item.go
+++ b/src/storage/item.go
@@ -7,6 +7,8 @@ import (
 	"log"
 	"strings"
 	"time"
+
+	"github.com/nkanaev/yarr/src/content/htmlutil"
 )

 type ItemStatus int
@@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
 	for _, item := range items {
 		result, err := s.db.Exec(`
 			insert into search (title, description, content) values (?, ?, ?)`,
-			item.Title, HTMLText(item.Description), HTMLText(item.Content),
+			item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
 		)
 		if err != nil {
 			log.Print(err)
--- a/src/storage/utils.go
+++ b/src/storage/utils.go
@@ -1,24 +0,0 @@
-package storage
-
-import (
-	"strings"
-	"golang.org/x/net/html"
-)
-
-func HTMLText(s string) string {
-	tokenizer := html.NewTokenizer(strings.NewReader(s))
-	contents := make([]string, 0)
-	for {
-		token := tokenizer.Next()
-		if token == html.ErrorToken {
-			break
-		}
-		if token == html.TextToken {
-			content := strings.TrimSpace(html.UnescapeString(string(tokenizer.Text())))
-			if len(content) > 0 {
-				contents = append(contents, content)
-			}
-		}
-	}
-	return strings.Join(contents, " ")
-}