remove html tags from titles

This commit is contained in:
nkanaev 2021-11-10 10:54:12 +00:00
parent 77c7f938f1
commit 26b87dee98
3 changed files with 26 additions and 3 deletions

View File

@ -1,5 +1,3 @@
- strip out html tags from titles
https://www.aldaily.com/feed/
- feedlist keyboard navigation is flaky in "unread" section
- windows cli mode not working
incorporate changes from:

View File

@ -9,6 +9,8 @@ import (
"net/url"
"strings"
"time"
"github.com/nkanaev/yarr/src/content/htmlutil"
)
// UnknownFormat is a sentinel error carrying the message "unknown feed format".
// NOTE(review): presumably returned when the input matches no supported feed
// type — confirm against the parser.
var UnknownFormat = errors.New("unknown feed format")
@ -80,7 +82,7 @@ func (feed *Feed) cleanup() {
for i, item := range feed.Items {
feed.Items[i].GUID = strings.TrimSpace(item.GUID)
feed.Items[i].URL = strings.TrimSpace(item.URL)
feed.Items[i].Title = strings.TrimSpace(item.Title)
feed.Items[i].Title = strings.TrimSpace(htmlutil.ExtractText(item.Title))
feed.Items[i].Content = strings.TrimSpace(item.Content)
if item.ImageURL != "" && strings.Contains(item.Content, item.ImageURL) {

View File

@ -180,3 +180,26 @@ func TestRSSPodcastDuplicated(t *testing.T) {
t.Fatal("item.audio_url must be unset if present in the content")
}
}
// TestRSSTitleHTMLTags parses an RSS document whose item titles contain
// entity-escaped HTML markup and checks that the resulting titles hold only
// the text content, with the tags removed.
func TestRSSTitleHTMLTags(t *testing.T) {
	feed, err := Parse(strings.NewReader(`
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>&lt;p&gt;title in p&lt;/p&gt;</title>
</item>
<item>
<title>very &lt;strong&gt;strong&lt;/strong&gt; title</title>
</item>
</channel>
</rss>
`))
	if err != nil {
		t.Fatalf("unexpected parse error: %v", err)
	}

	want := []string{"title in p", "very strong title"}

	// Verify the item count up front so a missing item is reported as a
	// regular test failure instead of an index-out-of-range panic below.
	if len(feed.Items) != len(want) {
		t.Fatalf("item count doesn't match\nwant: %d\nhave: %d\n", len(want), len(feed.Items))
	}

	for i, w := range want {
		if have := feed.Items[i].Title; have != w {
			t.Errorf("title doesn't match\nwant: %#v\nhave: %#v\n", w, have)
		}
	}
}