mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
fix encoding
This commit is contained in:
parent
e3e9542f1e
commit
52cc8ecbbd
@ -11,20 +11,23 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/nkanaev/yarr/src/content/htmlutil"
|
"github.com/nkanaev/yarr/src/content/htmlutil"
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
)
|
)
|
||||||
|
|
||||||
var UnknownFormat = errors.New("unknown feed format")
|
var UnknownFormat = errors.New("unknown feed format")
|
||||||
|
|
||||||
type processor func(r io.Reader) (*Feed, error)
|
type processor func(r io.Reader) (*Feed, error)
|
||||||
|
|
||||||
func sniff(lookup string) (string, processor) {
|
func sniff(lookup string) (string, bool, processor) {
|
||||||
lookup = strings.TrimSpace(lookup)
|
lookup = strings.TrimSpace(lookup)
|
||||||
lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
|
lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
|
||||||
|
|
||||||
if len(lookup) < 0 {
|
if len(lookup) == 0 {
|
||||||
return "", nil
|
return "", false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var decode bool
|
||||||
|
|
||||||
switch lookup[0] {
|
switch lookup[0] {
|
||||||
case '<':
|
case '<':
|
||||||
decoder := xmlDecoder(strings.NewReader(lookup))
|
decoder := xmlDecoder(strings.NewReader(lookup))
|
||||||
@ -33,24 +36,32 @@ func sniff(lookup string) (string, processor) {
|
|||||||
if token == nil {
|
if token == nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
// check for absence of xml encoding <?xml encoding="ENCODING" ?>
|
||||||
|
if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
|
||||||
|
decode = strings.Index(string(el.Inst), "encoding=") == -1
|
||||||
|
}
|
||||||
if el, ok := token.(xml.StartElement); ok {
|
if el, ok := token.(xml.StartElement); ok {
|
||||||
switch el.Name.Local {
|
switch el.Name.Local {
|
||||||
case "rss":
|
case "rss":
|
||||||
return "rss", ParseRSS
|
return "rss", decode, ParseRSS
|
||||||
case "RDF":
|
case "RDF":
|
||||||
return "rdf", ParseRDF
|
return "rdf", decode, ParseRDF
|
||||||
case "feed":
|
case "feed":
|
||||||
return "atom", ParseAtom
|
return "atom", decode, ParseAtom
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case '{':
|
case '{':
|
||||||
return "json", ParseJSON
|
return "json", true, ParseJSON
|
||||||
}
|
}
|
||||||
return "", nil
|
return "", false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func Parse(r io.Reader) (*Feed, error) {
|
func Parse(r io.Reader) (*Feed, error) {
|
||||||
|
return ParseWithEncoding(r, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
func ParseWithEncoding(r io.Reader, fallbackEncoding string) (*Feed, error) {
|
||||||
lookup := make([]byte, 2048)
|
lookup := make([]byte, 2048)
|
||||||
n, err := io.ReadFull(r, lookup)
|
n, err := io.ReadFull(r, lookup)
|
||||||
switch {
|
switch {
|
||||||
@ -63,11 +74,18 @@ func Parse(r io.Reader) (*Feed, error) {
|
|||||||
r = io.MultiReader(bytes.NewReader(lookup), r)
|
r = io.MultiReader(bytes.NewReader(lookup), r)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, callback := sniff(string(lookup))
|
_, decode, callback := sniff(string(lookup))
|
||||||
if callback == nil {
|
if callback == nil {
|
||||||
return nil, UnknownFormat
|
return nil, UnknownFormat
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if decode && fallbackEncoding != "" {
|
||||||
|
r, err = charset.NewReaderLabel(fallbackEncoding, r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
feed, err := callback(r)
|
feed, err := callback(r)
|
||||||
if feed != nil {
|
if feed != nil {
|
||||||
feed.cleanup()
|
feed.cleanup()
|
||||||
@ -75,8 +93,8 @@ func Parse(r io.Reader) (*Feed, error) {
|
|||||||
return feed, err
|
return feed, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func ParseAndFix(r io.Reader, baseURL string) (*Feed, error) {
|
func ParseAndFix(r io.Reader, baseURL, fallbackEncoding string) (*Feed, error) {
|
||||||
feed, err := Parse(r)
|
feed, err := ParseWithEncoding(r, fallbackEncoding)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -457,14 +457,13 @@ func (s *Server) handlePageCrawl(c *router.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
res, err := worker.GetHTTP(url)
|
body, err := worker.GetBody(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
c.Out.WriteHeader(http.StatusBadRequest)
|
c.Out.WriteHeader(http.StatusBadRequest)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
content, err := readability.ExtractContent(strings.NewReader(body))
|
||||||
content, err := readability.ExtractContent(res.Body)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
c.Out.WriteHeader(http.StatusNoContent)
|
c.Out.WriteHeader(http.StatusNoContent)
|
||||||
|
@ -50,16 +50,3 @@ func init() {
|
|||||||
userAgent: "Yarr/1.0",
|
userAgent: "Yarr/1.0",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetHTTP(url string) (*http.Response, error) {
|
|
||||||
res, err := client.get(url)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
body, err := httpBody(res)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
res.Body = body
|
|
||||||
return res, nil
|
|
||||||
}
|
|
||||||
|
@ -6,6 +6,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
"mime"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
@ -38,18 +39,15 @@ func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
|
|||||||
if res.StatusCode != 200 {
|
if res.StatusCode != 200 {
|
||||||
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
return nil, fmt.Errorf("status code %d", res.StatusCode)
|
||||||
}
|
}
|
||||||
|
cs := getCharset(res)
|
||||||
|
|
||||||
body, err := httpBody(res)
|
body, err := io.ReadAll(res.Body)
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
content, err := ioutil.ReadAll(body)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to feed into parser
|
// Try to feed into parser
|
||||||
feed, err := parser.ParseAndFix(bytes.NewReader(content), candidateUrl)
|
feed, err := parser.ParseAndFix(bytes.NewReader(body), candidateUrl, cs)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
result.Feed = feed
|
result.Feed = feed
|
||||||
result.FeedLink = candidateUrl
|
result.FeedLink = candidateUrl
|
||||||
@ -57,8 +55,16 @@ func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Possibly an html link. Search for feed links
|
// Possibly an html link. Search for feed links
|
||||||
|
content := string(body)
|
||||||
|
if cs != "" {
|
||||||
|
if r, err := charset.NewReaderLabel(cs, bytes.NewReader(body)); err == nil {
|
||||||
|
if body, err := io.ReadAll(r); err == nil {
|
||||||
|
content = string(body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
sources := make([]FeedSource, 0)
|
sources := make([]FeedSource, 0)
|
||||||
for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
|
for url, title := range scraper.FindFeeds(content, candidateUrl) {
|
||||||
sources = append(sources, FeedSource{Title: title, Url: url})
|
sources = append(sources, FeedSource{Title: title, Url: url})
|
||||||
}
|
}
|
||||||
switch {
|
switch {
|
||||||
@ -184,12 +190,7 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err := httpBody(res)
|
feed, err := parser.ParseAndFix(res.Body, f.FeedLink, getCharset(res))
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
feed, err := parser.ParseAndFix(body, f.FeedLink)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -202,14 +203,39 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
|
|||||||
return ConvertItems(feed.Items, f), nil
|
return ConvertItems(feed.Items, f), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func httpBody(res *http.Response) (io.ReadCloser, error) {
|
func getCharset(res *http.Response) string {
|
||||||
|
contentType := res.Header.Get("Content-Type")
|
||||||
|
if _, params, err := mime.ParseMediaType(contentType); err == nil {
|
||||||
|
if cs, ok := params["charset"]; ok {
|
||||||
|
if e, _ := charset.Lookup(cs); e != nil {
|
||||||
|
return cs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetBody(url string) (string, error) {
|
||||||
|
res, err := client.get(url)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
|
||||||
|
var r io.Reader
|
||||||
|
|
||||||
ctype := res.Header.Get("Content-Type")
|
ctype := res.Header.Get("Content-Type")
|
||||||
if strings.Contains(ctype, "charset") {
|
if strings.Contains(ctype, "charset") {
|
||||||
reader, err := charset.NewReader(res.Body, ctype)
|
r, err = charset.NewReader(res.Body, ctype)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return "", err
|
||||||
}
|
}
|
||||||
return io.NopCloser(reader), nil
|
} else {
|
||||||
|
r = res.Body
|
||||||
}
|
}
|
||||||
return res.Body, nil
|
body, err := io.ReadAll(r)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return string(body), nil
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user