handle html type atom text

2025-05-24 00:33:14 +00:00 · 2021-04-02 21:46:23 +01:00 · 2021-04-02 21:46:23 +01:00 · e50c7e1a51
commit e50c7e1a51
parent 8967936fb6
5 changed files with 45 additions and 26 deletions
--- a/src/content/htmlutil/utils.go
+++ b/src/content/htmlutil/utils.go
@ -1,6 +1,7 @@
 package htmlutil
 import (
 	"bytes"
 	"strings"
 	"golang.org/x/net/html"
@ -39,3 +40,18 @@ func Text(node *html.Node) string {
 	}
 	return strings.Join(text, " ")
 }
 // ExtractText returns the text content of an HTML fragment with all markup
 // removed. Text tokens are concatenated exactly as the tokenizer yields them:
 // no separator is inserted between them and whitespace is not trimmed.
 // Tokenizing stops at the first error token, which includes end of input.
 func ExtractText(content string) string {
 	tokenizer := html.NewTokenizer(strings.NewReader(content))
 	buffer := bytes.Buffer{}
 	for {
 		token := tokenizer.Next()
 		if token == html.ErrorToken {
 			break
 		}
 		if token == html.TextToken {
 			// Tokenizer.Text already returns entity-unescaped bytes. Passing
 			// them through html.UnescapeString again would decode a second
 			// time and turn literal text such as "&amp;lt;" into "<".
 			buffer.Write(tokenizer.Text())
 		}
 	}
 	return buffer.String()
 }
--- a/src/parser/atom.go
+++ b/src/parser/atom.go
@ -6,6 +6,8 @@ import (
 	"html"
 	"io"
 	"strings"
 	"github.com/nkanaev/yarr/src/content/htmlutil"
 )
 type atomFeed struct {
@ -42,6 +44,13 @@ type atomLink struct {
 // atomLinks is a slice of atomLink values.
 type atomLinks []atomLink
 // Text returns the element's data as plain text. Data whose Type is "html"
 // is passed through htmlutil.ExtractText; any other type is returned as is.
 func (a *atomText) Text() string {
 	if a.Type != "html" {
 		return a.Data
 	}
 	return htmlutil.ExtractText(a.Data)
 }
 func (a *atomText) String() string {
 	data := a.Data
 	if a.Type == "xhtml" {
@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) {
 			GUID:     firstNonEmpty(srcitem.ID),
 			Date:     dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
 			URL:      firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
-			Title:    srcitem.Title.String(),
+			Title:    srcitem.Title.Text(),
 			Content:  firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
 			ImageURL: srcitem.firstMediaThumbnail(),
 			AudioURL: "",
--- a/src/parser/atom_test.go
+++ b/src/parser/atom_test.go
@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) {
 		t.FailNow()
 	}
 }
 // TestAtomHTMLTitle checks that an Atom entry title declared with type="html"
 // is exposed as plain text, i.e. with the embedded markup removed.
 func TestAtomHTMLTitle(t *testing.T) {
 	// NOTE(review): the <title> element in this fixture is never closed, so
 	// the test relies on the parser tolerating that — confirm it is intended.
 	feed, err := Parse(strings.NewReader(`
 		<?xml version="1.0" encoding="utf-8"?>
 		<feed xmlns="http://www.w3.org/2005/Atom">
 			<entry><title type="html">say &lt;code&gt;what&lt;/code&gt;?</entry>
 		</feed>
 	`))
 	// Fail with a readable message instead of panicking on the index below
 	// when parsing fails or yields no entries.
 	if err != nil {
 		t.Fatalf("unexpected parse error: %v", err)
 	}
 	if len(feed.Items) == 0 {
 		t.Fatal("expected at least one item, got none")
 	}
 	have := feed.Items[0].Title
 	want := "say what?"
 	if !reflect.DeepEqual(want, have) {
 		t.Logf("want: %#v", want)
 		t.Logf("have: %#v", have)
 		t.FailNow()
 	}
 }
--- a/src/storage/item.go
+++ b/src/storage/item.go
@ -7,6 +7,8 @@ import (
 	"log"
 	"strings"
 	"time"
 	"github.com/nkanaev/yarr/src/content/htmlutil"
 )
 // ItemStatus is an int-backed type.
 // NOTE(review): its defined values are not visible in this excerpt —
 // presumably the states an item can be in; confirm against the declarations.
 type ItemStatus int
@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
 	for _, item := range items {
 		result, err := s.db.Exec(`
 			insert into search (title, description, content) values (?, ?, ?)`,
-			item.Title, HTMLText(item.Description), HTMLText(item.Content),
+			item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
 		)
 		if err != nil {
 			log.Print(err)
--- a/src/storage/utils.go
+++ b/src/storage/utils.go
@ -1,24 +0,0 @@
 package storage
 import (
 	"strings"
 	"golang.org/x/net/html"
 )
 // HTMLText returns the text content of the HTML fragment s with all markup
 // removed. Each text token is trimmed of surrounding whitespace, empty
 // results are dropped, and the remaining pieces are joined with one space.
 func HTMLText(s string) string {
 	tokenizer := html.NewTokenizer(strings.NewReader(s))
 	var contents []string
 	for {
 		token := tokenizer.Next()
 		if token == html.ErrorToken {
 			break
 		}
 		if token == html.TextToken {
 			// Tokenizer.Text already returns entity-unescaped bytes; calling
 			// html.UnescapeString on top would decode entities a second time.
 			content := strings.TrimSpace(string(tokenizer.Text()))
 			if len(content) > 0 {
 				contents = append(contents, content)
 			}
 		}
 	}
 	return strings.Join(contents, " ")
 }