mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
Found in: https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml. When both RSS and Atom link elements are present, the XML parser returns an empty string. Provide a default namespace to capture the RSS link properly.
96 lines
2.4 KiB
Go
96 lines
2.4 KiB
Go
// Parser for RSS versions:
|
|
// - 0.91 netscape
|
|
// - 0.91 userland
|
|
// - 2.0
|
|
package parser
|
|
|
|
import (
|
|
"encoding/xml"
|
|
"io"
|
|
"path"
|
|
"strings"
|
|
)
|
|
|
|
type rssFeed struct {
|
|
XMLName xml.Name `xml:"rss"`
|
|
Version string `xml:"version,attr"`
|
|
Title string `xml:"channel>title"`
|
|
Link string `xml:"channel>link"`
|
|
Items []rssItem `xml:"channel>item"`
|
|
}
|
|
|
|
type rssItem struct {
|
|
GUID string `xml:"guid"`
|
|
Title string `xml:"title"`
|
|
Link string `xml:"rss link"`
|
|
Description string `xml:"rss description"`
|
|
PubDate string `xml:"pubDate"`
|
|
Enclosures []rssEnclosure `xml:"enclosure"`
|
|
|
|
DublinCoreDate string `xml:"http://purl.org/dc/elements/1.1/ date"`
|
|
ContentEncoded string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
|
|
|
|
OrigLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
|
|
OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
|
|
|
|
media
|
|
}
|
|
|
|
// rssLink captures a link-style element: its character data plus the
// href and rel attributes. XMLName records the name of the element that
// was decoded into the value.
//
// NOTE(review): nothing in the visible part of this file references
// rssLink; confirm whether it is still used elsewhere in the package.
type rssLink struct {
	XMLName xml.Name
	Data    string `xml:",chardata"`
	Href    string `xml:"href,attr"`
	Rel     string `xml:"rel,attr"`
}
|
|
|
|
// rssTitle captures a title-style element in two forms: Data receives the
// element's own character data, while Inner receives the raw XML nested
// inside the element. XMLName records the name of the decoded element.
//
// NOTE(review): nothing in the visible part of this file references
// rssTitle; confirm whether it is still used elsewhere in the package.
type rssTitle struct {
	XMLName xml.Name
	Data    string `xml:",chardata"`
	Inner   string `xml:",innerxml"`
}
|
|
|
|
// rssEnclosure holds the url, type and length attributes of an item's
// <enclosure> element. All three are kept as the raw attribute strings;
// ParseRSS inspects Type for an "audio/" prefix and uses URL as the
// podcast address.
type rssEnclosure struct {
	URL    string `xml:"url,attr"`
	Type   string `xml:"type,attr"`
	Length string `xml:"length,attr"`
}
|
|
|
|
func ParseRSS(r io.Reader) (*Feed, error) {
|
|
srcfeed := rssFeed{}
|
|
|
|
decoder := xmlDecoder(r)
|
|
decoder.DefaultSpace = "rss"
|
|
if err := decoder.Decode(&srcfeed); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
dstfeed := &Feed{
|
|
Title: srcfeed.Title,
|
|
SiteURL: srcfeed.Link,
|
|
}
|
|
for _, srcitem := range srcfeed.Items {
|
|
podcastURL := ""
|
|
for _, e := range srcitem.Enclosures {
|
|
if strings.HasPrefix(e.Type, "audio/") {
|
|
podcastURL = e.URL
|
|
|
|
if srcitem.OrigEnclosureLink != "" && strings.Contains(podcastURL, path.Base(srcitem.OrigEnclosureLink)) {
|
|
podcastURL = srcitem.OrigEnclosureLink
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
dstfeed.Items = append(dstfeed.Items, Item{
|
|
GUID: firstNonEmpty(srcitem.GUID, srcitem.Link),
|
|
Date: dateParse(firstNonEmpty(srcitem.DublinCoreDate, srcitem.PubDate)),
|
|
URL: firstNonEmpty(srcitem.OrigLink, srcitem.Link),
|
|
Title: srcitem.Title,
|
|
Content: firstNonEmpty(srcitem.ContentEncoded, srcitem.Description),
|
|
AudioURL: podcastURL,
|
|
ImageURL: srcitem.firstMediaThumbnail(),
|
|
})
|
|
}
|
|
return dstfeed, nil
|
|
}
|