handle html type atom text

This commit is contained in:
Nazar Kanaev
2021-04-02 21:46:23 +01:00
parent 8967936fb6
commit e50c7e1a51
5 changed files with 45 additions and 26 deletions

View File

@@ -7,6 +7,8 @@ import (
"log"
"strings"
"time"
"github.com/nkanaev/yarr/src/content/htmlutil"
)
type ItemStatus int
@@ -322,7 +324,7 @@ func (s *Storage) SyncSearch() {
for _, item := range items {
result, err := s.db.Exec(`
insert into search (title, description, content) values (?, ?, ?)`,
item.Title, HTMLText(item.Description), HTMLText(item.Content),
item.Title, htmlutil.ExtractText(item.Description), htmlutil.ExtractText(item.Content),
)
if err != nil {
log.Print(err)

View File

@@ -1,24 +0,0 @@
package storage
import (
"strings"
"golang.org/x/net/html"
)
// HTMLText returns the plain text found in the HTML fragment s.
//
// Each text token is trimmed of surrounding whitespace, tokens that end up
// empty are dropped, and the remaining pieces are joined with a single
// space. Everything that is not a text token (tags, comments, doctypes) is
// discarded.
func HTMLText(s string) string {
	tokenizer := html.NewTokenizer(strings.NewReader(s))
	var contents []string
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// ErrorToken is reported at end of input as well as on a read or
			// parse error; in both cases return the text gathered so far.
			return strings.Join(contents, " ")
		case html.TextToken:
			// Tokenizer.Text already returns entity-unescaped text, so it
			// must not be run through html.UnescapeString a second time:
			// that would turn a literal "&amp;lt;" into "<" instead of
			// "&lt;".
			if content := strings.TrimSpace(string(tokenizer.Text())); content != "" {
				contents = append(contents, content)
			}
		}
	}
}