handle html type atom text

This commit is contained in:
Nazar Kanaev 2021-04-02 21:46:23 +01:00
parent 8967936fb6
commit e50c7e1a51
5 changed files with 45 additions and 26 deletions

View File

@ -1,6 +1,7 @@
package htmlutil package htmlutil
import ( import (
"bytes"
"strings" "strings"
"golang.org/x/net/html" "golang.org/x/net/html"
@ -39,3 +40,18 @@ func Text(node *html.Node) string {
} }
return strings.Join(text, " ") return strings.Join(text, " ")
} }
// ExtractText returns the text content of the HTML fragment in content with
// all markup removed.
//
// Text tokens are concatenated without a separator, so inline markup such as
// "say <code>what</code>?" yields "say what?".
func ExtractText(content string) string {
	tokenizer := html.NewTokenizer(strings.NewReader(content))
	var buffer bytes.Buffer
	for {
		token := tokenizer.Next()
		// ErrorToken is reported at end of input as well as on real errors;
		// either way there is nothing more to read.
		if token == html.ErrorToken {
			break
		}
		if token == html.TextToken {
			// Tokenizer.Text already returns unescaped text. Unescaping it a
			// second time would corrupt literal entities: "&amp;lt;" must
			// come out as "&lt;", not "<".
			buffer.Write(tokenizer.Text())
		}
	}
	return buffer.String()
}

View File

@ -6,6 +6,8 @@ import (
"html" "html"
"io" "io"
"strings" "strings"
"github.com/nkanaev/yarr/src/content/htmlutil"
) )
type atomFeed struct { type atomFeed struct {
@ -42,6 +44,13 @@ type atomLink struct {
type atomLinks []atomLink type atomLinks []atomLink
// Text returns the plain-text form of the Atom text construct: markup is
// stripped from "html"-typed values, any other type is returned as-is.
func (a *atomText) Text() string {
	switch a.Type {
	case "html":
		return htmlutil.ExtractText(a.Data)
	default:
		return a.Data
	}
}
func (a *atomText) String() string { func (a *atomText) String() string {
data := a.Data data := a.Data
if a.Type == "xhtml" { if a.Type == "xhtml" {
@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) {
GUID: firstNonEmpty(srcitem.ID), GUID: firstNonEmpty(srcitem.ID),
Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)), Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")), URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
Title: srcitem.Title.String(), Title: srcitem.Title.Text(),
Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()), Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
ImageURL: srcitem.firstMediaThumbnail(), ImageURL: srcitem.firstMediaThumbnail(),
AudioURL: "", AudioURL: "",

View File

@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) {
t.FailNow() t.FailNow()
} }
} }
// TestAtomHTMLTitle checks that markup inside an Atom <title type="html">
// element is stripped, leaving a plain-text item title.
func TestAtomHTMLTitle(t *testing.T) {
	// The <title> element is explicitly closed so the fixture is well-formed
	// XML and the test does not depend on parser leniency.
	feed, err := Parse(strings.NewReader(`
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry><title type="html">say &lt;code&gt;what&lt;/code&gt;?</title></entry>
</feed>
`))
	// Fail with a readable message instead of panicking on the index below.
	if err != nil {
		t.Fatalf("unexpected parse error: %v", err)
	}
	if len(feed.Items) == 0 {
		t.Fatal("expected the feed to contain one item, got none")
	}
	have := feed.Items[0].Title
	want := "say what?"
	if !reflect.DeepEqual(want, have) {
		t.Logf("want: %#v", want)
		t.Logf("have: %#v", have)
		t.FailNow()
	}
}

View File

@ -7,6 +7,8 @@ import (
"log" "log"
"strings" "strings"
"time" "time"
"github.com/nkanaev/yarr/src/content/htmlutil"
) )
type ItemStatus int type ItemStatus int
@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
for _, item := range items { for _, item := range items {
result, err := s.db.Exec(` result, err := s.db.Exec(`
insert into search (title, description, content) values (?, ?, ?)`, insert into search (title, description, content) values (?, ?, ?)`,
item.Title, HTMLText(item.Description), HTMLText(item.Content), item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
) )
if err != nil { if err != nil {
log.Print(err) log.Print(err)

View File

@ -1,24 +0,0 @@
package storage
import (
"strings"
"golang.org/x/net/html"
)
// HTMLText strips markup from the HTML fragment s and returns its text
// content. Each text token is trimmed of surrounding whitespace, blank tokens
// are dropped, and the remaining pieces are joined with single spaces.
func HTMLText(s string) string {
	tokenizer := html.NewTokenizer(strings.NewReader(s))
	var contents []string
	for {
		token := tokenizer.Next()
		// ErrorToken is reported at end of input as well as on real errors.
		if token == html.ErrorToken {
			break
		}
		if token != html.TextToken {
			continue
		}
		// Tokenizer.Text already returns unescaped text; a second
		// html.UnescapeString pass would turn a literal "&amp;lt;" into "<"
		// instead of "&lt;".
		content := strings.TrimSpace(string(tokenizer.Text()))
		if content != "" {
			contents = append(contents, content)
		}
	}
	return strings.Join(contents, " ")
}