mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
handle html type atom text
This commit is contained in:
parent
8967936fb6
commit
e50c7e1a51
@ -1,6 +1,7 @@
|
||||
package htmlutil
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
@ -39,3 +40,18 @@ func Text(node *html.Node) string {
|
||||
}
|
||||
return strings.Join(text, " ")
|
||||
}
|
||||
|
||||
func ExtractText(content string) string {
|
||||
tokenizer := html.NewTokenizer(strings.NewReader(content))
|
||||
buffer := bytes.Buffer{}
|
||||
for {
|
||||
token := tokenizer.Next()
|
||||
if token == html.ErrorToken {
|
||||
break
|
||||
}
|
||||
if token == html.TextToken {
|
||||
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
||||
}
|
||||
}
|
||||
return buffer.String()
|
||||
}
|
||||
|
@ -6,6 +6,8 @@ import (
|
||||
"html"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||
)
|
||||
|
||||
type atomFeed struct {
|
||||
@ -42,6 +44,13 @@ type atomLink struct {
|
||||
|
||||
// atomLinks is a slice of atomLink values.
type atomLinks []atomLink
|
||||
|
||||
func (a *atomText) Text() string {
|
||||
if a.Type == "html" {
|
||||
return htmlutil.ExtractText(a.Data)
|
||||
}
|
||||
return a.Data
|
||||
}
|
||||
|
||||
func (a *atomText) String() string {
|
||||
data := a.Data
|
||||
if a.Type == "xhtml" {
|
||||
@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) {
|
||||
GUID: firstNonEmpty(srcitem.ID),
|
||||
Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
|
||||
URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
|
||||
Title: srcitem.Title.String(),
|
||||
Title: srcitem.Title.Text(),
|
||||
Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
|
||||
ImageURL: srcitem.firstMediaThumbnail(),
|
||||
AudioURL: "",
|
||||
|
@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) {
|
||||
t.FailNow()
|
||||
}
|
||||
}
|
||||
|
||||
func TestAtomHTMLTitle(t *testing.T) {
|
||||
feed, _ := Parse(strings.NewReader(`
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<entry><title type="html">say <code>what</code>?</entry>
|
||||
</feed>
|
||||
`))
|
||||
have := feed.Items[0].Title
|
||||
want := "say what?"
|
||||
if !reflect.DeepEqual(want, have) {
|
||||
t.Logf("want: %#v", want)
|
||||
t.Logf("have: %#v", have)
|
||||
t.FailNow()
|
||||
}
|
||||
}
|
||||
|
@ -7,6 +7,8 @@ import (
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||
)
|
||||
|
||||
// ItemStatus is an int-backed status value.
// NOTE(review): presumably the state of a stored feed item; confirm
// against the constants declared for this type.
type ItemStatus int
|
||||
@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
|
||||
for _, item := range items {
|
||||
result, err := s.db.Exec(`
|
||||
insert into search (title, description, content) values (?, ?, ?)`,
|
||||
item.Title, HTMLText(item.Description), HTMLText(item.Content),
|
||||
item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
|
||||
)
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
|
@ -1,24 +0,0 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
func HTMLText(s string) string {
|
||||
tokenizer := html.NewTokenizer(strings.NewReader(s))
|
||||
contents := make([]string, 0)
|
||||
for {
|
||||
token := tokenizer.Next()
|
||||
if token == html.ErrorToken {
|
||||
break
|
||||
}
|
||||
if token == html.TextToken {
|
||||
content := strings.TrimSpace(html.UnescapeString(string(tokenizer.Text())))
|
||||
if len(content) > 0 {
|
||||
contents = append(contents, content)
|
||||
}
|
||||
}
|
||||
}
|
||||
return strings.Join(contents, " ")
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user