diff --git a/src/content/htmlutil/utils.go b/src/content/htmlutil/utils.go index df26445..3e475f8 100644 --- a/src/content/htmlutil/utils.go +++ b/src/content/htmlutil/utils.go @@ -1,6 +1,7 @@ package htmlutil import ( + "bytes" "strings" "golang.org/x/net/html" @@ -39,3 +40,18 @@ func Text(node *html.Node) string { } return strings.Join(text, " ") } + +func ExtractText(content string) string { + tokenizer := html.NewTokenizer(strings.NewReader(content)) + buffer := bytes.Buffer{} + for { + token := tokenizer.Next() + if token == html.ErrorToken { + break + } + if token == html.TextToken { + buffer.WriteString(html.UnescapeString(string(tokenizer.Text()))) + } + } + return buffer.String() +} diff --git a/src/parser/atom.go b/src/parser/atom.go index 31856ec..43b7d91 100644 --- a/src/parser/atom.go +++ b/src/parser/atom.go @@ -6,6 +6,8 @@ import ( "html" "io" "strings" + + "github.com/nkanaev/yarr/src/content/htmlutil" ) type atomFeed struct { @@ -42,6 +44,13 @@ type atomLink struct { type atomLinks []atomLink +func (a *atomText) Text() string { + if a.Type == "html" { + return htmlutil.ExtractText(a.Data) + } + return a.Data +} + func (a *atomText) String() string { data := a.Data if a.Type == "xhtml" { @@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) { GUID: firstNonEmpty(srcitem.ID), Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)), URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")), - Title: srcitem.Title.String(), + Title: srcitem.Title.Text(), Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()), ImageURL: srcitem.firstMediaThumbnail(), AudioURL: "", diff --git a/src/parser/atom_test.go b/src/parser/atom_test.go index de94918..9b1c423 100644 --- a/src/parser/atom_test.go +++ b/src/parser/atom_test.go @@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) { t.FailNow() } } + +func TestAtomHTMLTitle(t *testing.T) { + feed, _ := Parse(strings.NewReader(` + + + say <code>what</code>?</entry> + </feed> + `)) + have := feed.Items[0].Title + want := "say what?" + if !reflect.DeepEqual(want, have) { + t.Logf("want: %#v", want) + t.Logf("have: %#v", have) + t.FailNow() + } +} diff --git a/src/storage/item.go b/src/storage/item.go index 34faa42..e0e99b3 100644 --- a/src/storage/item.go +++ b/src/storage/item.go @@ -7,6 +7,8 @@ import ( "log" "strings" "time" + + "github.com/nkanaev/yarr/src/content/htmlutil" ) type ItemStatus int @@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() { for _, item := range items { result, err := s.db.Exec(` insert into search (title, description, content) values (?, ?, ?)`, - item.Title, HTMLText(item.Description), HTMLText(item.Content), + item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content), ) if err != nil { log.Print(err) diff --git a/src/storage/utils.go b/src/storage/utils.go deleted file mode 100644 index 2e8bad0..0000000 --- a/src/storage/utils.go +++ /dev/null @@ -1,24 +0,0 @@ -package storage - -import ( - "strings" - "golang.org/x/net/html" -) - -func HTMLText(s string) string { - tokenizer := html.NewTokenizer(strings.NewReader(s)) - contents := make([]string, 0) - for { - token := tokenizer.Next() - if token == html.ErrorToken { - break - } - if token == html.TextToken { - content := strings.TrimSpace(html.UnescapeString(string(tokenizer.Text()))) - if len(content) > 0 { - contents = append(contents, content) - } - } - } - return strings.Join(contents, " ") -}