From 63ad9718908a9149e1979eac341b7590c9036ca5 Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Sun, 4 Apr 2021 21:31:25 +0100 Subject: [PATCH] unset audio/image if present in the content --- src/parser/atom_test.go | 38 ++++++++++++++++++++++++++++++++++ src/parser/feed.go | 7 +++++++ src/parser/rss.go | 4 +--- src/parser/rss_test.go | 45 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 3 deletions(-) diff --git a/src/parser/atom_test.go b/src/parser/atom_test.go index 9b1c423..fa01297 100644 --- a/src/parser/atom_test.go +++ b/src/parser/atom_test.go @@ -93,3 +93,41 @@ func TestAtomHTMLTitle(t *testing.T) { t.FailNow() } } + +func TestAtomImageLink(t *testing.T) { + feed, _ := Parse(strings.NewReader(` + + + + + + + `)) + have := feed.Items[0].ImageURL + want := `https://example.com/image.png?width=100&height=100` + if want != have { + t.Fatalf("item.image_url doesn't match\nwant: %#v\nhave: %#v\n", want, have) + } +} + +// found in: https://www.reddit.com/r/funny.rss +// items come with thumbnail urls which are also present in the content +func TestAtomImageLinkDuplicated(t *testing.T) { + feed, _ := Parse(strings.NewReader(` + + + + <img src="https://example.com/image.png?width=100&height=100"> + + + + `)) + have := feed.Items[0].Content + want := `` + if want != have { + t.Fatalf("want: %#v\nhave: %#v\n", want, have) + } + if feed.Items[0].ImageURL != "" { + t.Fatal("item.image_url must be unset if present in the content") + } +} diff --git a/src/parser/feed.go b/src/parser/feed.go index 504ed5d..925d8c1 100644 --- a/src/parser/feed.go +++ b/src/parser/feed.go @@ -75,6 +75,13 @@ func (feed *Feed) cleanup() { feed.Items[i].URL = strings.TrimSpace(item.URL) feed.Items[i].Title = strings.TrimSpace(item.Title) feed.Items[i].Content = strings.TrimSpace(item.Content) + + if item.ImageURL != "" && strings.Contains(item.Content, item.ImageURL) { + feed.Items[i].ImageURL = "" + } + if item.AudioURL != "" && strings.Contains(item.Content, 
item.AudioURL) { + feed.Items[i].AudioURL = "" + } } } diff --git a/src/parser/rss.go b/src/parser/rss.go index 114d3c0..01b6829 100644 --- a/src/parser/rss.go +++ b/src/parser/rss.go @@ -77,11 +77,9 @@ func ParseRSS(r io.Reader) (*Feed, error) { if e.Type == "audio/mpeg" || e.Type == "audio/x-m4a" { podcastURL = e.URL - origBase := path.Base(srcitem.OrigEnclosureLink) - if origBase != "" && strings.Contains(podcastURL, origBase) { + if srcitem.OrigEnclosureLink != "" && strings.Contains(podcastURL, path.Base(srcitem.OrigEnclosureLink)) { podcastURL = srcitem.OrigEnclosureLink } - break } } diff --git a/src/parser/rss_test.go b/src/parser/rss_test.go index 0baefc8..58e9c5d 100644 --- a/src/parser/rss_test.go +++ b/src/parser/rss_test.go @@ -115,3 +115,48 @@ func TestRSSWithLotsOfSpaces(t *testing.T) { t.FailNow() } } + +func TestRSSPodcast(t *testing.T) { + feed, _ := Parse(strings.NewReader(` + + + + + + + + + `)) + have := feed.Items[0].AudioURL + want := "http://example.com/audio.ext" + if want != have { + t.Logf("want: %#v", want) + t.Logf("have: %#v", have) + t.FailNow() + } +} + +// found in: https://podcast.cscript.site/podcast.xml +func TestRSSPodcastDuplicated(t *testing.T) { + feed, _ := Parse(strings.NewReader(` + + + + + + ]]> + + + + + + `)) + have := feed.Items[0].Content + want := `` + if want != have { + t.Fatalf("content doesn't match\nwant: %#v\nhave: %#v\n", want, have) + } + if feed.Items[0].AudioURL != "" { + t.Fatal("item.audio_url must be unset if present in the content") + } +}