mirror of
https://github.com/nkanaev/yarr.git
synced 2025-07-08 16:00:11 +00:00
handle invalid chars in non-utf8 xml
This commit is contained in:
parent
18221ef12d
commit
be7af0ccaf
@ -16,18 +16,20 @@ import (
|
||||
|
||||
var UnknownFormat = errors.New("unknown feed format")
|
||||
|
||||
type processor func(r io.Reader) (*Feed, error)
|
||||
type feedProbe struct {
|
||||
feedType string
|
||||
callback func(r io.Reader) (*Feed, error)
|
||||
encoding string
|
||||
}
|
||||
|
||||
func sniff(lookup string) (string, bool, processor) {
|
||||
func sniff(lookup string) (out feedProbe) {
|
||||
lookup = strings.TrimSpace(lookup)
|
||||
lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
|
||||
|
||||
if len(lookup) == 0 {
|
||||
return "", false, nil
|
||||
return
|
||||
}
|
||||
|
||||
var decode bool
|
||||
|
||||
switch lookup[0] {
|
||||
case '<':
|
||||
decoder := xmlDecoder(strings.NewReader(lookup))
|
||||
@ -36,25 +38,35 @@ func sniff(lookup string) (string, bool, processor) {
|
||||
if token == nil {
|
||||
break
|
||||
}
|
||||
// check for absence of xml encoding <?xml encoding="ENCODING" ?>
|
||||
|
||||
// check <?xml encoding="ENCODING" ?>
|
||||
if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
|
||||
decode = strings.Index(string(el.Inst), "encoding=") == -1
|
||||
out.encoding = strings.ToLower(procInst("encoding", string(el.Inst)))
|
||||
}
|
||||
|
||||
if el, ok := token.(xml.StartElement); ok {
|
||||
switch el.Name.Local {
|
||||
case "rss":
|
||||
return "rss", decode, ParseRSS
|
||||
out.feedType = "rss"
|
||||
out.callback = ParseRSS
|
||||
return
|
||||
case "RDF":
|
||||
return "rdf", decode, ParseRDF
|
||||
out.feedType = "rdf"
|
||||
out.callback = ParseRDF
|
||||
return
|
||||
case "feed":
|
||||
return "atom", decode, ParseAtom
|
||||
out.feedType = "atom"
|
||||
out.callback = ParseAtom
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
case '{':
|
||||
return "json", true, ParseJSON
|
||||
out.feedType = "json"
|
||||
out.callback = ParseJSON
|
||||
return
|
||||
}
|
||||
return "", false, nil
|
||||
return
|
||||
}
|
||||
|
||||
func Parse(r io.Reader) (*Feed, error) {
|
||||
@ -74,19 +86,26 @@ func ParseWithEncoding(r io.Reader, fallbackEncoding string) (*Feed, error) {
|
||||
r = io.MultiReader(bytes.NewReader(lookup), r)
|
||||
}
|
||||
|
||||
_, decode, callback := sniff(string(lookup))
|
||||
if callback == nil {
|
||||
out := sniff(string(lookup))
|
||||
if out.feedType == "" {
|
||||
return nil, UnknownFormat
|
||||
}
|
||||
|
||||
if decode && fallbackEncoding != "" {
|
||||
if out.encoding == "" && fallbackEncoding != "" {
|
||||
r, err = charset.NewReaderLabel(fallbackEncoding, r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
feed, err := callback(r)
|
||||
if (out.feedType != "json") && (out.encoding == "" || out.encoding == "utf-8") {
|
||||
// XML decoder will not rely on custom CharsetReader (see `xmlDecoder`)
|
||||
// to handle invalid xml characters.
|
||||
// Assume input is already UTF-8 and do the cleanup here.
|
||||
r = NewSafeXMLReader(r)
|
||||
}
|
||||
|
||||
feed, err := out.callback(r)
|
||||
if feed != nil {
|
||||
feed.cleanup()
|
||||
}
|
||||
|
@ -7,38 +7,40 @@ import (
|
||||
)
|
||||
|
||||
func TestSniff(t *testing.T) {
|
||||
testcases := [][2]string{
|
||||
testcases := []struct{
|
||||
input string
|
||||
want feedProbe
|
||||
}{
|
||||
{
|
||||
`<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`,
|
||||
"rdf",
|
||||
feedProbe{feedType: "rdf", callback: ParseRDF},
|
||||
},
|
||||
{
|
||||
`<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`,
|
||||
"rss",
|
||||
feedProbe{feedType: "rss", callback: ParseRSS, encoding: "iso-8859-1"},
|
||||
},
|
||||
{
|
||||
`<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`,
|
||||
"rss",
|
||||
feedProbe{feedType: "rss", callback: ParseRSS},
|
||||
},
|
||||
{
|
||||
`<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`,
|
||||
"atom",
|
||||
feedProbe{feedType: "atom", callback: ParseAtom, encoding: "utf-8"},
|
||||
},
|
||||
{
|
||||
`{}`,
|
||||
"json",
|
||||
feedProbe{feedType: "json", callback: ParseJSON},
|
||||
},
|
||||
{
|
||||
`<!DOCTYPE html><html><head><title></title></head><body></body></html>`,
|
||||
"",
|
||||
feedProbe{},
|
||||
},
|
||||
}
|
||||
for _, testcase := range testcases {
|
||||
have, _, _ := sniff(testcase[0])
|
||||
want := testcase[1]
|
||||
if want != have {
|
||||
t.Log(testcase[0])
|
||||
t.Errorf("Invalid format: want=%#v have=%#v", want, have)
|
||||
want := testcase.want
|
||||
have := sniff(testcase.input)
|
||||
if want.encoding != have.encoding || want.feedType != have.feedType {
|
||||
t.Errorf("Invalid output\n---\n%s\n---\n\nwant=%#v\nhave=%#v", testcase.input, want, have)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -107,3 +109,44 @@ func TestParseFeedWithBOM(t *testing.T) {
|
||||
t.FailNow()
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCleanIllegalCharsInUTF8(t *testing.T) {
|
||||
data := `
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<item>
|
||||
<title>` + "\a" + `title</title>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
`
|
||||
feed, err := Parse(strings.NewReader(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(feed.Items) != 1 || feed.Items[0].Title != "title" {
|
||||
t.Fatalf("invalid feed, got: %v", feed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCleanIllegalCharsInNonUTF8(t *testing.T) {
|
||||
// echo привет | iconv -f utf8 -t cp1251 | hexdump -C
|
||||
data := `
|
||||
<?xml version="1.0" encoding="windows-1251"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<item>
|
||||
<title>` + "\a \xef\xf0\xe8\xe2\xe5\xf2\x0a \a" + `</title>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
`
|
||||
feed, err := Parse(strings.NewReader(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(feed.Items) != 1 || feed.Items[0].Title != "привет" {
|
||||
t.Fatalf("invalid feed, got: %v", feed)
|
||||
}
|
||||
}
|
||||
|
@ -30,9 +30,15 @@ func plain2html(text string) string {
|
||||
}
|
||||
|
||||
func xmlDecoder(r io.Reader) *xml.Decoder {
|
||||
decoder := xml.NewDecoder(NewSafeXMLReader(r))
|
||||
decoder := xml.NewDecoder(r)
|
||||
decoder.Strict = false
|
||||
decoder.CharsetReader = charset.NewReaderLabel
|
||||
decoder.CharsetReader = func(cs string, input io.Reader) (io.Reader, error) {
|
||||
r, err := charset.NewReaderLabel(cs, input)
|
||||
if err == nil {
|
||||
r = NewSafeXMLReader(r)
|
||||
}
|
||||
return r, err
|
||||
}
|
||||
return decoder
|
||||
}
|
||||
|
||||
@ -79,3 +85,28 @@ func isInCharacterRange(r rune) (inrange bool) {
|
||||
r >= 0xE000 && r <= 0xFFFD ||
|
||||
r >= 0x10000 && r <= 0x10FFFF
|
||||
}
|
||||
|
||||
// procInst extracts the value of a `param="..."` or `param='...'`
// pseudo-attribute from s, the body of an XML processing instruction (for
// example the `version="1.0" encoding="utf-8"` part of an XML declaration).
// It returns "" when no quoted value for param is found.
//
// NOTE: adapted from the "encoding/xml" package. The scan is textual rather
// than a real attribute parser: it looks for the literal `param=` followed
// immediately by a quote character.
func procInst(param, s string) string {
	param += "="
	rest := s
	for {
		idx := strings.Index(rest, param)
		if idx == -1 {
			return ""
		}
		rest = rest[idx+len(param):]
		if rest == "" {
			return ""
		}
		quote := rest[0]
		if quote != '\'' && quote != '"' {
			// Not a quoted value (e.g. `param=` appearing inside another
			// attribute's value); keep looking for a later occurrence
			// instead of giving up.
			continue
		}
		end := strings.IndexByte(rest[1:], quote)
		if end == -1 {
			// The opening quote is never closed.
			return ""
		}
		return rest[1 : end+1]
	}
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user