mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
handle html type atom text
This commit is contained in:
parent
8967936fb6
commit
e50c7e1a51
@ -1,6 +1,7 @@
|
|||||||
package htmlutil
|
package htmlutil
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
@ -39,3 +40,18 @@ func Text(node *html.Node) string {
|
|||||||
}
|
}
|
||||||
return strings.Join(text, " ")
|
return strings.Join(text, " ")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ExtractText(content string) string {
|
||||||
|
tokenizer := html.NewTokenizer(strings.NewReader(content))
|
||||||
|
buffer := bytes.Buffer{}
|
||||||
|
for {
|
||||||
|
token := tokenizer.Next()
|
||||||
|
if token == html.ErrorToken {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if token == html.TextToken {
|
||||||
|
buffer.WriteString(html.UnescapeString(string(tokenizer.Text())))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return buffer.String()
|
||||||
|
}
|
||||||
|
@ -6,6 +6,8 @@ import (
|
|||||||
"html"
|
"html"
|
||||||
"io"
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
type atomFeed struct {
|
type atomFeed struct {
|
||||||
@ -42,6 +44,13 @@ type atomLink struct {
|
|||||||
|
|
||||||
type atomLinks []atomLink
|
type atomLinks []atomLink
|
||||||
|
|
||||||
|
func (a *atomText) Text() string {
|
||||||
|
if a.Type == "html" {
|
||||||
|
return htmlutil.ExtractText(a.Data)
|
||||||
|
}
|
||||||
|
return a.Data
|
||||||
|
}
|
||||||
|
|
||||||
func (a *atomText) String() string {
|
func (a *atomText) String() string {
|
||||||
data := a.Data
|
data := a.Data
|
||||||
if a.Type == "xhtml" {
|
if a.Type == "xhtml" {
|
||||||
@ -76,7 +85,7 @@ func ParseAtom(r io.Reader) (*Feed, error) {
|
|||||||
GUID: firstNonEmpty(srcitem.ID),
|
GUID: firstNonEmpty(srcitem.ID),
|
||||||
Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
|
Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
|
||||||
URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
|
URL: firstNonEmpty(srcitem.OrigLink, srcitem.Links.First("alternate"), srcitem.Links.First("")),
|
||||||
Title: srcitem.Title.String(),
|
Title: srcitem.Title.Text(),
|
||||||
Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
|
Content: firstNonEmpty(srcitem.Content.String(), srcitem.Summary.String(), srcitem.firstMediaDescription()),
|
||||||
ImageURL: srcitem.firstMediaThumbnail(),
|
ImageURL: srcitem.firstMediaThumbnail(),
|
||||||
AudioURL: "",
|
AudioURL: "",
|
||||||
|
@ -77,3 +77,19 @@ func TestAtomClashingNamespaces(t *testing.T) {
|
|||||||
t.FailNow()
|
t.FailNow()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAtomHTMLTitle(t *testing.T) {
|
||||||
|
feed, _ := Parse(strings.NewReader(`
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||||
|
<entry><title type="html">say <code>what</code>?</entry>
|
||||||
|
</feed>
|
||||||
|
`))
|
||||||
|
have := feed.Items[0].Title
|
||||||
|
want := "say what?"
|
||||||
|
if !reflect.DeepEqual(want, have) {
|
||||||
|
t.Logf("want: %#v", want)
|
||||||
|
t.Logf("have: %#v", have)
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -7,6 +7,8 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ItemStatus int
|
type ItemStatus int
|
||||||
@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
|
|||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
result, err := s.db.Exec(`
|
result, err := s.db.Exec(`
|
||||||
insert into search (title, description, content) values (?, ?, ?)`,
|
insert into search (title, description, content) values (?, ?, ?)`,
|
||||||
item.Title, HTMLText(item.Description), HTMLText(item.Content),
|
item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
|
@ -1,24 +0,0 @@
|
|||||||
package storage
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
"golang.org/x/net/html"
|
|
||||||
)
|
|
||||||
|
|
||||||
func HTMLText(s string) string {
|
|
||||||
tokenizer := html.NewTokenizer(strings.NewReader(s))
|
|
||||||
contents := make([]string, 0)
|
|
||||||
for {
|
|
||||||
token := tokenizer.Next()
|
|
||||||
if token == html.ErrorToken {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if token == html.TextToken {
|
|
||||||
content := strings.TrimSpace(html.UnescapeString(string(tokenizer.Text())))
|
|
||||||
if len(content) > 0 {
|
|
||||||
contents = append(contents, content)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return strings.Join(contents, " ")
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user