diff --git a/src/content/htmlutil/utils.go b/src/content/htmlutil/utils.go
index df26445..3e475f8 100644
--- a/src/content/htmlutil/utils.go
+++ b/src/content/htmlutil/utils.go
@@ -1,6 +1,7 @@
package htmlutil
import (
+ "bytes"
"strings"
"golang.org/x/net/html"
@@ -39,3 +40,18 @@ func Text(node *html.Node) string {
}
return strings.Join(text, " ")
}
+
+// ExtractText returns the text content of the HTML fragment in content with
+// all markup removed. Text nodes are concatenated in document order, no separator.
+func ExtractText(content string) string {
+	tokenizer := html.NewTokenizer(strings.NewReader(content))
+	var buffer bytes.Buffer
+	for tt := tokenizer.Next(); tt != html.ErrorToken; tt = tokenizer.Next() {
+		if tt == html.TextToken {
+			// Tokenizer.Text already returns unescaped text; unescaping it
+			// again would turn a literal "&amp;lt;" into "<".
+			buffer.Write(tokenizer.Text())
+		}
+	}
+	return buffer.String()
+}
diff --git a/src/parser/atom.go b/src/parser/atom.go
index 31856ec..43b7d91 100644
--- a/src/parser/atom.go
+++ b/src/parser/atom.go
@@ -6,6 +6,8 @@ import (
"html"
"io"
"strings"
+
+ "github.com/nkanaev/yarr/src/content/htmlutil"
)
type atomFeed struct {
@@ -42,6 +44,13 @@ type atomLink struct {
type atomLinks []atomLink
+// Text returns the value as plain text: when Type is "html" the markup is
+// stripped via htmlutil.ExtractText, otherwise Data is returned unchanged.
+func (a *atomText) Text() string {
+	if a.Type != "html" {
+		return a.Data
+	}
+	return htmlutil.ExtractText(a.Data)
+}
+
func (a *atomText) String() string {
data := a.Data
if a.Type == "xhtml" {
@@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) {
GUID: firstNonEmpty(srcitem.ID),
Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
- Title: srcitem.Title.String(),
+ Title: srcitem.Title.Text(),
Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
ImageURL: srcitem.firstMediaThumbnail(),
AudioURL: "",
diff --git a/src/parser/atom_test.go b/src/parser/atom_test.go
index de94918..9b1c423 100644
--- a/src/parser/atom_test.go
+++ b/src/parser/atom_test.go
@@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) {
t.FailNow()
}
}
+
+// TestAtomHTMLTitle checks that an Atom entry title of type "html" is
+// reduced to its text, with the embedded markup removed.
+func TestAtomHTMLTitle(t *testing.T) {
+	// Per the Atom text-construct rules, type="html" content is escaped
+	// markup, so the <code> element is written with entities here.
+	feed, err := Parse(strings.NewReader(`
+		<feed xmlns="http://www.w3.org/2005/Atom">
+			<entry><title type="html">say &lt;code&gt;what&lt;/code&gt;?</title></entry>
+		</feed>
+	`))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Guard the index below so a parsing regression fails instead of panicking.
+	if len(feed.Items) != 1 {
+		t.Fatalf("want 1 item, have %d", len(feed.Items))
+	}
+	have := feed.Items[0].Title
+	want := "say what?"
+	if have != want {
+		t.Fatalf("want: %#v\nhave: %#v", want, have)
+	}
+}
diff --git a/src/storage/item.go b/src/storage/item.go
index 34faa42..e0e99b3 100644
--- a/src/storage/item.go
+++ b/src/storage/item.go
@@ -7,6 +7,8 @@ import (
"log"
"strings"
"time"
+
+ "github.com/nkanaev/yarr/src/content/htmlutil"
)
type ItemStatus int
@@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
for _, item := range items {
result, err := s.db.Exec(`
insert into search (title, description, content) values (?, ?, ?)`,
- item.Title, HTMLText(item.Description), HTMLText(item.Content),
+ item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
)
if err != nil {
log.Print(err)
diff --git a/src/storage/utils.go b/src/storage/utils.go
deleted file mode 100644
index 2e8bad0..0000000
--- a/src/storage/utils.go
+++ /dev/null
@@ -1,24 +0,0 @@
-package storage
-
-import (
- "strings"
- "golang.org/x/net/html"
-)
-
-func HTMLText(s string) string {
- tokenizer := html.NewTokenizer(strings.NewReader(s))
- contents := make([]string, 0)
- for {
- token := tokenizer.Next()
- if token == html.ErrorToken {
- break
- }
- if token == html.TextToken {
- content := strings.TrimSpace(html.UnescapeString(string(tokenizer.Text())))
- if len(content) > 0 {
- contents = append(contents, content)
- }
- }
- }
- return strings.Join(contents, " ")
-}