handle invalid chars in non-utf8 xml

This commit is contained in:
Nazar Kanaev 2022-02-14 15:23:55 +00:00
parent 18221ef12d
commit be7af0ccaf
3 changed files with 123 additions and 30 deletions

View File

@ -16,18 +16,20 @@ import (
var UnknownFormat = errors.New("unknown feed format") var UnknownFormat = errors.New("unknown feed format")
type processor func(r io.Reader) (*Feed, error) type feedProbe struct {
feedType string
callback func(r io.Reader) (*Feed, error)
encoding string
}
func sniff(lookup string) (string, bool, processor) { func sniff(lookup string) (out feedProbe) {
lookup = strings.TrimSpace(lookup) lookup = strings.TrimSpace(lookup)
lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF") lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
if len(lookup) == 0 { if len(lookup) == 0 {
return "", false, nil return
} }
var decode bool
switch lookup[0] { switch lookup[0] {
case '<': case '<':
decoder := xmlDecoder(strings.NewReader(lookup)) decoder := xmlDecoder(strings.NewReader(lookup))
@ -36,25 +38,35 @@ func sniff(lookup string) (string, bool, processor) {
if token == nil { if token == nil {
break break
} }
// check for absence of xml encoding <?xml encoding="ENCODING" ?>
// check <?xml encoding="ENCODING" ?>
if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" { if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
decode = strings.Index(string(el.Inst), "encoding=") == -1 out.encoding = strings.ToLower(procInst("encoding", string(el.Inst)))
} }
if el, ok := token.(xml.StartElement); ok { if el, ok := token.(xml.StartElement); ok {
switch el.Name.Local { switch el.Name.Local {
case "rss": case "rss":
return "rss", decode, ParseRSS out.feedType = "rss"
out.callback = ParseRSS
return
case "RDF": case "RDF":
return "rdf", decode, ParseRDF out.feedType = "rdf"
out.callback = ParseRDF
return
case "feed": case "feed":
return "atom", decode, ParseAtom out.feedType = "atom"
out.callback = ParseAtom
return
} }
} }
} }
case '{': case '{':
return "json", true, ParseJSON out.feedType = "json"
out.callback = ParseJSON
return
} }
return "", false, nil return
} }
func Parse(r io.Reader) (*Feed, error) { func Parse(r io.Reader) (*Feed, error) {
@ -74,19 +86,26 @@ func ParseWithEncoding(r io.Reader, fallbackEncoding string) (*Feed, error) {
r = io.MultiReader(bytes.NewReader(lookup), r) r = io.MultiReader(bytes.NewReader(lookup), r)
} }
_, decode, callback := sniff(string(lookup)) out := sniff(string(lookup))
if callback == nil { if out.feedType == "" {
return nil, UnknownFormat return nil, UnknownFormat
} }
if decode && fallbackEncoding != "" { if out.encoding == "" && fallbackEncoding != "" {
r, err = charset.NewReaderLabel(fallbackEncoding, r) r, err = charset.NewReaderLabel(fallbackEncoding, r)
if err != nil { if err != nil {
return nil, err return nil, err
} }
} }
feed, err := callback(r) if (out.feedType != "json") && (out.encoding == "" || out.encoding == "utf-8") {
// XML decoder will not rely on custom CharsetReader (see `xmlDecoder`)
// to handle invalid xml characters.
// Assume input is already UTF-8 and do the cleanup here.
r = NewSafeXMLReader(r)
}
feed, err := out.callback(r)
if feed != nil { if feed != nil {
feed.cleanup() feed.cleanup()
} }

View File

@ -7,38 +7,40 @@ import (
) )
func TestSniff(t *testing.T) { func TestSniff(t *testing.T) {
testcases := [][2]string{ testcases := []struct{
input string
want feedProbe
}{
{ {
`<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`, `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`,
"rdf", feedProbe{feedType: "rdf", callback: ParseRDF},
}, },
{ {
`<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`, `<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`,
"rss", feedProbe{feedType: "rss", callback: ParseRSS, encoding: "iso-8859-1"},
}, },
{ {
`<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`, `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`,
"rss", feedProbe{feedType: "rss", callback: ParseRSS},
}, },
{ {
`<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`, `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`,
"atom", feedProbe{feedType: "atom", callback: ParseAtom, encoding: "utf-8"},
}, },
{ {
`{}`, `{}`,
"json", feedProbe{feedType: "json", callback: ParseJSON},
}, },
{ {
`<!DOCTYPE html><html><head><title></title></head><body></body></html>`, `<!DOCTYPE html><html><head><title></title></head><body></body></html>`,
"", feedProbe{},
}, },
} }
for _, testcase := range testcases { for _, testcase := range testcases {
have, _, _ := sniff(testcase[0]) want := testcase.want
want := testcase[1] have := sniff(testcase.input)
if want != have { if want.encoding != have.encoding || want.feedType != have.feedType {
t.Log(testcase[0]) t.Errorf("Invalid output\n---\n%s\n---\n\nwant=%#v\nhave=%#v", testcase.input, want, have)
t.Errorf("Invalid format: want=%#v have=%#v", want, have)
} }
} }
} }
@ -107,3 +109,44 @@ func TestParseFeedWithBOM(t *testing.T) {
t.FailNow() t.FailNow()
} }
} }
// TestParseCleanIllegalCharsInUTF8 feeds Parse an RSS document declared as
// UTF-8 whose item title starts with a BEL control character ("\a") and
// expects parsing to succeed with exactly one item titled "title", i.e. with
// the control character gone.
func TestParseCleanIllegalCharsInUTF8(t *testing.T) {
	const bell = "\a"
	doc := `
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>` + bell + `title</title>
</item>
</channel>
</rss>
`
	feed, err := Parse(strings.NewReader(doc))
	if err != nil {
		t.Fatal(err)
	}
	// Check the item count first so indexing below cannot go out of range.
	if len(feed.Items) != 1 {
		t.Fatalf("invalid feed, got: %v", feed)
	}
	if got := feed.Items[0].Title; got != "title" {
		t.Fatalf("invalid feed, got: %v", feed)
	}
}
// TestParseCleanIllegalCharsInNonUTF8 feeds Parse an RSS document declared as
// windows-1251 whose item title holds the cp1251 bytes of "привет" surrounded
// by BEL control characters and whitespace. It expects parsing to succeed with
// exactly one item whose title is the decoded word alone.
func TestParseCleanIllegalCharsInNonUTF8(t *testing.T) {
	// echo привет | iconv -f utf8 -t cp1251 | hexdump -C
	const rawTitle = "\a \xef\xf0\xe8\xe2\xe5\xf2\x0a \a"
	doc := `
<?xml version="1.0" encoding="windows-1251"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>` + rawTitle + `</title>
</item>
</channel>
</rss>
`
	feed, err := Parse(strings.NewReader(doc))
	if err != nil {
		t.Fatal(err)
	}
	// Check the item count first so indexing below cannot go out of range.
	if len(feed.Items) != 1 {
		t.Fatalf("invalid feed, got: %v", feed)
	}
	if got := feed.Items[0].Title; got != "привет" {
		t.Fatalf("invalid feed, got: %v", feed)
	}
}

View File

@ -30,9 +30,15 @@ func plain2html(text string) string {
} }
func xmlDecoder(r io.Reader) *xml.Decoder { func xmlDecoder(r io.Reader) *xml.Decoder {
decoder := xml.NewDecoder(NewSafeXMLReader(r)) decoder := xml.NewDecoder(r)
decoder.Strict = false decoder.Strict = false
decoder.CharsetReader = charset.NewReaderLabel decoder.CharsetReader = func(cs string, input io.Reader) (io.Reader, error) {
r, err := charset.NewReaderLabel(cs, input)
if err == nil {
r = NewSafeXMLReader(r)
}
return r, err
}
return decoder return decoder
} }
@ -79,3 +85,28 @@ func isInCharacterRange(r rune) (inrange bool) {
r >= 0xE000 && r <= 0xFFFD || r >= 0xE000 && r <= 0xFFFD ||
r >= 0x10000 && r <= 0x10FFFF r >= 0x10000 && r <= 0x10FFFF
} }
// NOTE: adapted from the unexported helper of the same name in the
// "encoding/xml" package.
//
// procInst parses the `param="..."` or `param='...'` value out of the
// provided string (the body of an XML processing instruction) and returns
// the text between the quotes, or "" if no such quoted value is found.
//
// A plain substring search for `param=` is not enough: it would also hit the
// tail of a longer name (`xencoding="..."` when looking for `encoding`) and
// would give up on an unquoted first occurrence even if a valid one follows.
// Such matches are skipped and the scan continues with the rest of s.
func procInst(param, s string) string {
	// Bytes that may legally continue an (ASCII) XML name; a match preceded
	// by one of them is only the suffix of a different pseudo-attribute.
	isNameByte := func(c byte) bool {
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			return true
		case c == '_' || c == '-' || c == '.' || c == ':':
			return true
		}
		return false
	}

	key := param + "="
	for from := 0; from < len(s); {
		rel := strings.Index(s[from:], key)
		if rel == -1 {
			return ""
		}
		start := from + rel
		valueAt := start + len(key)
		// Any further search resumes right after this occurrence; len(key)
		// is at least 1, so the loop always makes progress.
		from = valueAt

		if start > 0 && isNameByte(s[start-1]) {
			continue
		}
		if valueAt >= len(s) {
			// `param=` is the very end of the input: there is no value.
			return ""
		}
		quote := s[valueAt]
		if quote != '\'' && quote != '"' {
			continue
		}
		end := strings.IndexByte(s[valueAt+1:], quote)
		if end == -1 {
			// Opening quote without a matching closing quote.
			return ""
		}
		return s[valueAt+1 : valueAt+1+end]
	}
	return ""
}