extract data from media elements

This commit is contained in:
Nazar Kanaev 2021-03-25 16:55:19 +00:00
parent fe1a1987bd
commit 6685bce51c
4 changed files with 70 additions and 8 deletions

View File

@ -24,6 +24,7 @@ type atomEntry struct {
Updated string `xml:"updated"`
Links atomLinks `xml:"link"`
Content atomText `xml:"content"`
media
}
type atomText struct {
@ -69,17 +70,14 @@ func ParseAtom(r io.Reader) (*Feed, error) {
SiteURL: firstNonEmpty(srcfeed.Links.First("alternate"), srcfeed.Links.First("")),
}
for _, srcitem := range srcfeed.Entries {
imageUrl := ""
podcastUrl := ""
dstfeed.Items = append(dstfeed.Items, Item{
GUID: firstNonEmpty(srcitem.ID),
Date: dateParse(firstNonEmpty(srcitem.Published, srcitem.Updated)),
URL: firstNonEmpty(srcitem.Links.First("alternate"), srcfeed.Links.First("")),
Title: srcitem.Title.String(),
Content: srcitem.Content.String(),
ImageURL: imageUrl,
PodcastURL: podcastUrl,
Content: firstNonEmpty(srcitem.Content.String(), srcitem.firstMediaDescription()),
ImageURL: srcitem.firstMediaThumbnail(),
PodcastURL: "",
})
}
return dstfeed, nil

53
src/parser/media.go Normal file
View File

@ -0,0 +1,53 @@
package parser
// media gathers the child elements in the http://search.yahoo.com/mrss/ XML
// namespace that are decoded from an element: group containers, plus
// thumbnail and description elements that appear as direct children.
type media struct {
// MediaGroups holds every <group> child; each group carries its own
// thumbnails and descriptions.
MediaGroups []mediaGroup `xml:"http://search.yahoo.com/mrss/ group"`
// MediaThumbnails holds the <thumbnail> elements that are direct children.
MediaThumbnails []mediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
// MediaDescriptions holds the <description> elements that are direct children.
MediaDescriptions []mediaDescription `xml:"http://search.yahoo.com/mrss/ description"`
}
// mediaGroup is a <group> element in the http://search.yahoo.com/mrss/
// namespace, decoded down to the thumbnails and descriptions nested in it.
type mediaGroup struct {
// MediaThumbnails holds the <thumbnail> children of the group.
MediaThumbnails []mediaThumbnail `xml:"http://search.yahoo.com/mrss/ thumbnail"`
// MediaDescriptions holds the <description> children of the group.
MediaDescriptions []mediaDescription `xml:"http://search.yahoo.com/mrss/ description"`
}
// mediaContent maps four attributes of an element (url, type, fileSize and
// medium) onto string fields.
// NOTE(review): none of the other declarations in this file reference this
// type — confirm whether it is used elsewhere or reserved for later use.
type mediaContent struct {
// URL is the value of the url attribute.
URL string `xml:"url,attr"`
// Type is the value of the type attribute.
Type string `xml:"type,attr"`
// FileSize is the raw fileSize attribute; it is kept as a string and not
// parsed into a number.
FileSize string `xml:"fileSize,attr"`
// Medium is the value of the medium attribute.
Medium string `xml:"medium,attr"`
}
// mediaThumbnail is a thumbnail element reduced to its url attribute.
type mediaThumbnail struct {
// URL is the value of the url attribute; it stays "" when the attribute
// is absent.
URL string `xml:"url,attr"`
}
// mediaDescription is a description element: its type attribute together
// with its text content.
type mediaDescription struct {
// Type is the value of the type attribute, stored verbatim.
Type string `xml:"type,attr"`
// Description is the character data found inside the element.
Description string `xml:",chardata"`
}
// firstMediaThumbnail returns the URL of the first thumbnail that actually
// carries one. Thumbnails that are direct children are preferred; thumbnails
// nested inside groups are consulted only when no direct thumbnail has a URL.
// It returns "" when no thumbnail URL is found.
func (m *media) firstMediaThumbnail() string {
	for _, t := range m.MediaThumbnails {
		// A thumbnail without a url attribute decodes to "". Skip it so it
		// does not mask a usable thumbnail further down.
		if t.URL != "" {
			return t.URL
		}
	}
	for _, g := range m.MediaGroups {
		for _, t := range g.MediaThumbnails {
			if t.URL != "" {
				return t.URL
			}
		}
	}
	return ""
}
// firstMediaDescription returns the first non-empty description converted to
// HTML with plain2html. Descriptions that are direct children are preferred;
// descriptions nested inside groups are consulted only when every direct one
// is empty. It returns "" when no description text is found.
//
// NOTE(review): the description's Type attribute is not inspected, so every
// description is treated as plain text — confirm that is intended for
// descriptions declared as HTML.
func (m *media) firstMediaDescription() string {
	for _, d := range m.MediaDescriptions {
		// An empty element would yield "" and hide a real description that
		// follows (or lives in a group), so keep looking.
		if d.Description != "" {
			return plain2html(d.Description)
		}
	}
	for _, g := range m.MediaGroups {
		for _, d := range g.MediaDescriptions {
			if d.Description != "" {
				return plain2html(d.Description)
			}
		}
	}
	return ""
}

View File

@ -3,8 +3,11 @@ package parser
import (
"encoding/xml"
"io"
"golang.org/x/net/html/charset"
"regexp"
"strings"
"time"
"golang.org/x/net/html/charset"
)
func firstNonEmpty(vals ...string) string {
@ -16,6 +19,14 @@ func firstNonEmpty(vals ...string) string {
return ""
}
// linkRe matches an http:// or https:// URL: the scheme followed by a run of
// non-whitespace characters, captured as group 1.
var linkRe = regexp.MustCompile(`(https?:\/\/\S+)`)

// plain2html turns plain text into a small HTML fragment: every URL matched
// by linkRe is wrapped in an anchor pointing at itself, and every newline is
// replaced with a <br> tag. The text is not HTML-escaped.
func plain2html(text string) string {
	const (
		anchorTemplate = `<a href="$1">$1</a>`
		lineBreak      = "<br>"
	)
	linked := linkRe.ReplaceAllString(text, anchorTemplate)
	return strings.ReplaceAll(linked, "\n", lineBreak)
}
func xmlDecoder(r io.Reader) *xml.Decoder {
decoder := xml.NewDecoder(r)
decoder.Strict = false