fix encoding

nkanaev 2022-01-24 16:47:32 +00:00
parent e3e9542f1e
commit 52cc8ecbbd
4 changed files with 75 additions and 45 deletions

View File

@@ -11,20 +11,23 @@ import (
 	"time"
 
 	"github.com/nkanaev/yarr/src/content/htmlutil"
+	"golang.org/x/net/html/charset"
 )
 
 var UnknownFormat = errors.New("unknown feed format")
 
 type processor func(r io.Reader) (*Feed, error)
 
-func sniff(lookup string) (string, processor) {
+func sniff(lookup string) (string, bool, processor) {
 	lookup = strings.TrimSpace(lookup)
 	lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
-	if len(lookup) < 0 {
-		return "", nil
+	if len(lookup) == 0 {
+		return "", false, nil
 	}
+	var decode bool
 	switch lookup[0] {
 	case '<':
 		decoder := xmlDecoder(strings.NewReader(lookup))
@@ -33,24 +36,32 @@ func sniff(lookup string) (string, processor) {
 			if token == nil {
 				break
 			}
+			// check for absence of xml encoding <?xml encoding="ENCODING" ?>
+			if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
+				decode = strings.Index(string(el.Inst), "encoding=") == -1
+			}
 			if el, ok := token.(xml.StartElement); ok {
 				switch el.Name.Local {
 				case "rss":
-					return "rss", ParseRSS
+					return "rss", decode, ParseRSS
 				case "RDF":
-					return "rdf", ParseRDF
+					return "rdf", decode, ParseRDF
 				case "feed":
-					return "atom", ParseAtom
+					return "atom", decode, ParseAtom
 				}
 			}
 		}
 	case '{':
-		return "json", ParseJSON
+		return "json", true, ParseJSON
 	}
-	return "", nil
+	return "", false, nil
 }
 
 func Parse(r io.Reader) (*Feed, error) {
+	return ParseWithEncoding(r, "")
+}
+
+func ParseWithEncoding(r io.Reader, fallbackEncoding string) (*Feed, error) {
 	lookup := make([]byte, 2048)
 	n, err := io.ReadFull(r, lookup)
 	switch {
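For reference, the prolog check added to sniff can be exercised on its own. A minimal sketch, not part of the commit: the helper name needsFallback and the sample documents are made up, and CharsetReader is set here so that a declared legacy encoding does not abort tokenization (the sniff code above relies on its own xmlDecoder helper for that).

package main

import (
	"encoding/xml"
	"fmt"
	"strings"

	"golang.org/x/net/html/charset"
)

// needsFallback reports whether the document's <?xml ?> prolog omits an
// encoding attribute, mirroring the ProcInst check added to sniff above.
func needsFallback(doc string) bool {
	decoder := xml.NewDecoder(strings.NewReader(doc))
	// Tolerate declared legacy encodings instead of failing on them.
	decoder.CharsetReader = charset.NewReaderLabel

	decode := false
	for {
		token, err := decoder.Token()
		if err != nil {
			break
		}
		if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
			decode = !strings.Contains(string(el.Inst), "encoding=")
		}
		if _, ok := token.(xml.StartElement); ok {
			break
		}
	}
	return decode
}

func main() {
	fmt.Println(needsFallback(`<?xml version="1.0"?><rss></rss>`))                         // true
	fmt.Println(needsFallback(`<?xml version="1.0" encoding="windows-1251"?><rss></rss>`)) // false
}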
@@ -63,11 +74,18 @@ func Parse(r io.Reader) (*Feed, error) {
 		r = io.MultiReader(bytes.NewReader(lookup), r)
 	}
 
-	_, callback := sniff(string(lookup))
+	_, decode, callback := sniff(string(lookup))
 	if callback == nil {
 		return nil, UnknownFormat
 	}
 
+	if decode && fallbackEncoding != "" {
+		r, err = charset.NewReaderLabel(fallbackEncoding, r)
+		if err != nil {
+			return nil, err
+		}
+	}
+
 	feed, err := callback(r)
 	if feed != nil {
 		feed.cleanup()
@@ -75,8 +93,8 @@ func Parse(r io.Reader) (*Feed, error) {
 	return feed, err
 }
 
-func ParseAndFix(r io.Reader, baseURL string) (*Feed, error) {
-	feed, err := Parse(r)
+func ParseAndFix(r io.Reader, baseURL, fallbackEncoding string) (*Feed, error) {
+	feed, err := ParseWithEncoding(r, fallbackEncoding)
 	if err != nil {
 		return nil, err
 	}
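A quick usage sketch of the new entry point, not taken from the commit itself: the import path and the Feed.Title field are assumed from the rest of the repo, and feed.xml is a hypothetical file. The fallback label only takes effect when sniff reports that the document declares no encoding of its own.

package main

import (
	"fmt"
	"os"

	"github.com/nkanaev/yarr/src/parser"
)

func main() {
	// Hypothetical feed whose <?xml ?> prolog omits the encoding attribute;
	// in yarr the label normally comes from the HTTP Content-Type header.
	// Feeds whose prolog does declare an encoding are left to the XML decoder.
	f, err := os.Open("feed.xml")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	feed, err := parser.ParseWithEncoding(f, "windows-1251")
	if err != nil {
		panic(err)
	}
	fmt.Println(feed.Title)
}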

View File

@@ -457,14 +457,13 @@ func (s *Server) handlePageCrawl(c *router.Context) {
 		return
 	}
-	res, err := worker.GetHTTP(url)
+	body, err := worker.GetBody(url)
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusBadRequest)
 		return
 	}
-	defer res.Body.Close()
-	content, err := readability.ExtractContent(res.Body)
+	content, err := readability.ExtractContent(strings.NewReader(body))
 	if err != nil {
 		log.Print(err)
 		c.Out.WriteHeader(http.StatusNoContent)
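Roughly, the crawl path now looks like this when pulled out of the handler. A sketch only: the import paths are assumptions, as is readability.ExtractContent returning the extracted markup as a string.

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/nkanaev/yarr/src/content/readability"
	"github.com/nkanaev/yarr/src/worker"
)

func main() {
	// GetBody (added in the worker changes below) returns decoded UTF-8 text,
	// so readability no longer sees raw bytes in the page's original charset.
	body, err := worker.GetBody("https://example.com/article")
	if err != nil {
		log.Fatal(err)
	}
	content, err := readability.ExtractContent(strings.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(content)
}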

View File

@@ -50,16 +50,3 @@ func init() {
 		userAgent: "Yarr/1.0",
 	}
 }
-
-func GetHTTP(url string) (*http.Response, error) {
-	res, err := client.get(url)
-	if err != nil {
-		return nil, err
-	}
-	body, err := httpBody(res)
-	if err != nil {
-		return nil, err
-	}
-	res.Body = body
-	return res, nil
-}

View File

@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"mime"
 	"net/http"
 	"net/url"
 	"strings"
@@ -38,18 +39,15 @@ func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
 	if res.StatusCode != 200 {
 		return nil, fmt.Errorf("status code %d", res.StatusCode)
 	}
-	body, err := httpBody(res)
-	if err != nil {
-		return nil, err
-	}
-	content, err := ioutil.ReadAll(body)
+	cs := getCharset(res)
+	body, err := io.ReadAll(res.Body)
 	if err != nil {
 		return nil, err
 	}
 
 	// Try to feed into parser
-	feed, err := parser.ParseAndFix(bytes.NewReader(content), candidateUrl)
+	feed, err := parser.ParseAndFix(bytes.NewReader(body), candidateUrl, cs)
 	if err == nil {
 		result.Feed = feed
 		result.FeedLink = candidateUrl
@@ -57,8 +55,16 @@ func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
 	}
 
 	// Possibly an html link. Search for feed links
+	content := string(body)
+	if cs != "" {
+		if r, err := charset.NewReaderLabel(cs, bytes.NewReader(body)); err == nil {
+			if body, err := io.ReadAll(r); err == nil {
+				content = string(body)
+			}
+		}
+	}
 	sources := make([]FeedSource, 0)
-	for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
+	for url, title := range scraper.FindFeeds(content, candidateUrl) {
 		sources = append(sources, FeedSource{Title: title, Url: url})
 	}
 	switch {
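The charset.NewReaderLabel call used for the HTML branch above can be tried standalone. A small sketch with made-up windows-1251 sample bytes, not part of the commit:

package main

import (
	"bytes"
	"fmt"
	"io"

	"golang.org/x/net/html/charset"
)

func main() {
	// "Привет" as windows-1251 bytes; there is no BOM and no UTF-8 here,
	// so only the label tells us how to decode it.
	raw := []byte{0xcf, 0xf0, 0xe8, 0xe2, 0xe5, 0xf2}

	r, err := charset.NewReaderLabel("windows-1251", bytes.NewReader(raw))
	if err != nil {
		panic(err)
	}
	decoded, err := io.ReadAll(r)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(decoded)) // Привет
}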
@@ -184,12 +190,7 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
 		return nil, nil
 	}
-	body, err := httpBody(res)
-	if err != nil {
-		return nil, err
-	}
-	feed, err := parser.ParseAndFix(body, f.FeedLink)
+	feed, err := parser.ParseAndFix(res.Body, f.FeedLink, getCharset(res))
 	if err != nil {
 		return nil, err
 	}
@@ -202,14 +203,39 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
 	return ConvertItems(feed.Items, f), nil
 }
 
-func httpBody(res *http.Response) (io.ReadCloser, error) {
+func getCharset(res *http.Response) string {
+	contentType := res.Header.Get("Content-Type")
+	if _, params, err := mime.ParseMediaType(contentType); err == nil {
+		if cs, ok := params["charset"]; ok {
+			if e, _ := charset.Lookup(cs); e != nil {
+				return cs
+			}
		}
+	}
+	return ""
+}
+
+func GetBody(url string) (string, error) {
+	res, err := client.get(url)
+	if err != nil {
+		return "", err
+	}
+	defer res.Body.Close()
+
+	var r io.Reader
+
 	ctype := res.Header.Get("Content-Type")
 	if strings.Contains(ctype, "charset") {
-		reader, err := charset.NewReader(res.Body, ctype)
+		r, err = charset.NewReader(res.Body, ctype)
 		if err != nil {
-			return nil, err
+			return "", err
 		}
-		return io.NopCloser(reader), nil
+	} else {
+		r = res.Body
 	}
-	return res.Body, nil
+	body, err := io.ReadAll(r)
+	if err != nil {
+		return "", err
+	}
+	return string(body), nil
 }
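The header sniffing in getCharset can also be exercised on its own. A rough sketch with hypothetical header values; headerCharset is a made-up helper that takes the raw header string instead of an *http.Response:

package main

import (
	"fmt"
	"mime"

	"golang.org/x/net/html/charset"
)

// headerCharset returns the charset parameter of a Content-Type value,
// but only when x/net's charset package recognizes the label.
func headerCharset(contentType string) string {
	if _, params, err := mime.ParseMediaType(contentType); err == nil {
		if cs, ok := params["charset"]; ok {
			if e, _ := charset.Lookup(cs); e != nil {
				return cs
			}
		}
	}
	return ""
}

func main() {
	fmt.Println(headerCharset("text/xml; charset=windows-1251"))  // windows-1251
	fmt.Println(headerCharset("text/xml; charset=no-such-label")) // empty: unknown label
	fmt.Println(headerCharset("application/json"))                // empty: no charset param
}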