extract data from media elements

2025-12-20 18:54:03 +00:00 · 2021-03-25 16:55:19 +00:00
parent fe1a1987bd
commit 6685bce51c
4 changed files with 70 additions and 8 deletions
--- a/src/parser/atom.go
+++ b/src/parser/atom.go
@@ -24,6 +24,7 @@ type atomEntry struct {
 	Updated   string    `xml:"updated"`
 	Links     atomLinks `xml:"link"`
 	Content   atomText  `xml:"content"`
+	media
 }

 type atomText struct {
@@ -69,17 +70,14 @@ func ParseAtom(r io.Reader) (*Feed, error) {
 		SiteURL: firstNonEmpty(srcfeed.Links.First("alternate"), srcfeed.Links.First("")),
 	}
 	for _, srcitem := range srcfeed.Entries {
-		imageUrl := ""
-		podcastUrl := ""
-
 		dstfeed.Items = append(dstfeed.Items, Item{
 			GUID:       firstNonEmpty(srcitem.ID),
 			Date:       dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
 			URL:        firstNonEmpty(srcitem.Links.First("alternate"), srcfeed.Links.First("")),
 			Title:      srcitem.Title.String(),
-			Content:    srcitem.Content.String(),
-			ImageURL:   imageUrl,
-			PodcastURL: podcastUrl,
+			Content:    firstNonEmpty(srcitem.Content.String(), srcitem.firstMediaDescription()),
+			ImageURL:   srcitem.firstMediaThumbnail(),
+			PodcastURL: "",
 		})
 	}
 	return dstfeed, nil
--- a/src/parser/media.go
+++ b/src/parser/media.go
@@ -0,0 +1,53 @@
+package parser
+
+// media collects the elements of the http://search.yahoo.com/mrss/ namespace
+// found on a feed entry. It is embedded into entry structs so that
+// encoding/xml fills these fields while the entry itself is decoded.
+type media struct {
+	// <group> elements, each carrying its own thumbnails and descriptions.
+	MediaGroups []mediaGroup `xml:"http://search.yahoo.com/mrss/ group"`
+
+	// <thumbnail> and <description> elements that are direct children of the
+	// decoded element, i.e. not wrapped in a <group>.
+	MediaThumbnails   []mediaThumbnail   `xml:"http://search.yahoo.com/mrss/ thumbnail"`
+	MediaDescriptions []mediaDescription `xml:"http://search.yahoo.com/mrss/ description"`
+}
+
+// mediaGroup maps a <group> element of the http://search.yahoo.com/mrss/
+// namespace together with the <thumbnail> and <description> elements nested
+// inside it.
+type mediaGroup struct {
+	MediaThumbnails   []mediaThumbnail   `xml:"http://search.yahoo.com/mrss/ thumbnail"`
+	MediaDescriptions []mediaDescription `xml:"http://search.yahoo.com/mrss/ description"`
+}
+
+// mediaContent holds the url, type, fileSize and medium attributes of a single
+// XML element. All values are kept as the raw attribute strings.
+// NOTE(review): presumably this maps the Media RSS <content> element, but no
+// struct tag visible here binds it to an element name — confirm at the use site.
+type mediaContent struct {
+	URL      string `xml:"url,attr"`
+	Type     string `xml:"type,attr"`
+	FileSize string `xml:"fileSize,attr"`
+	Medium   string `xml:"medium,attr"`
+}
+
+// mediaThumbnail maps a <thumbnail> element; only its url attribute is kept.
+type mediaThumbnail struct {
+	URL string `xml:"url,attr"`
+}
+
+// mediaDescription maps a <description> element: its type attribute and its
+// character data.
+type mediaDescription struct {
+	Type        string `xml:"type,attr"`
+	Description string `xml:",chardata"`
+}
+
+// firstMediaThumbnail returns the URL of the first thumbnail that has one.
+// Thumbnails attached directly to the entry take precedence over thumbnails
+// nested in media groups. It returns "" when no thumbnail carries a URL.
+func (m *media) firstMediaThumbnail() string {
+	for _, t := range m.MediaThumbnails {
+		// A thumbnail without a url attribute must not shadow a later,
+		// usable one, so empty URLs are skipped instead of returned.
+		if t.URL != "" {
+			return t.URL
+		}
+	}
+	for _, g := range m.MediaGroups {
+		for _, t := range g.MediaThumbnails {
+			if t.URL != "" {
+				return t.URL
+			}
+		}
+	}
+	return ""
+}
+
+// firstMediaDescription returns the first non-empty media description,
+// converted to HTML with plain2html. Descriptions attached directly to the
+// entry take precedence over descriptions nested in media groups. It returns
+// "" when there is no description text at all.
+func (m *media) firstMediaDescription() string {
+	for _, d := range m.MediaDescriptions {
+		// An empty <description> must not hide a later one that has text,
+		// otherwise callers falling back on this value get nothing.
+		if d.Description != "" {
+			return plain2html(d.Description)
+		}
+	}
+	for _, g := range m.MediaGroups {
+		for _, d := range g.MediaDescriptions {
+			if d.Description != "" {
+				return plain2html(d.Description)
+			}
+		}
+	}
+	return ""
+}
--- a/src/parser/utils.go
+++ b/src/parser/utils.go
@@ -3,8 +3,11 @@ package parser
 import (
 	"encoding/xml"
+	"html"
 	"io"
-	"golang.org/x/net/html/charset"
+	"regexp"
+	"strings"
 	"time"
+
+	"golang.org/x/net/html/charset"
 )

 func firstNonEmpty(vals ...string) string {
@@ -16,6 +19,14 @@ func firstNonEmpty(vals ...string) string {
 	return ""
 }

+// linkRe matches an http:// or https:// URL: the scheme followed by a run of
+// non-whitespace characters. The whole URL is capture group 1.
+var linkRe = regexp.MustCompile(`(https?:\/\/\S+)`)
+
+// plain2html converts plain text into an HTML fragment: HTML-special
+// characters are escaped, http(s) URLs are turned into <a> links and every
+// newline becomes <br>.
+func plain2html(text string) string {
+	// Escape first so markup in the (untrusted) input is displayed literally
+	// instead of being injected into the page. Escaping neither adds nor
+	// removes whitespace, so the URL boundaries seen by linkRe are unchanged,
+	// and the escaped URL is the correct form for both href and link text.
+	text = html.EscapeString(text)
+	text = linkRe.ReplaceAllString(text, `<a href="$1">$1</a>`)
+	return strings.ReplaceAll(text, "\n", "<br>")
+}
+
 func xmlDecoder(r io.Reader) *xml.Decoder {
 	decoder := xml.NewDecoder(r)
 	decoder.Strict = false