handle invalid chars in non-utf8 xml

This commit is contained in:
Nazar Kanaev
2022-02-14 15:23:55 +00:00
parent 18221ef12d
commit be7af0ccaf
3 changed files with 123 additions and 30 deletions

View File

@@ -7,38 +7,40 @@ import (
)
func TestSniff(t *testing.T) {
testcases := [][2]string{
testcases := []struct{
input string
want feedProbe
}{
{
`<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`,
"rdf",
feedProbe{feedType: "rdf", callback: ParseRDF},
},
{
`<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`,
"rss",
feedProbe{feedType: "rss", callback: ParseRSS, encoding: "iso-8859-1"},
},
{
`<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`,
"rss",
feedProbe{feedType: "rss", callback: ParseRSS},
},
{
`<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`,
"atom",
feedProbe{feedType: "atom", callback: ParseAtom, encoding: "utf-8"},
},
{
`{}`,
"json",
feedProbe{feedType: "json", callback: ParseJSON},
},
{
`<!DOCTYPE html><html><head><title></title></head><body></body></html>`,
"",
feedProbe{},
},
}
for _, testcase := range testcases {
have, _, _ := sniff(testcase[0])
want := testcase[1]
if want != have {
t.Log(testcase[0])
t.Errorf("Invalid format: want=%#v have=%#v", want, have)
want := testcase.want
have := sniff(testcase.input)
if want.encoding != have.encoding || want.feedType != have.feedType {
t.Errorf("Invalid output\n---\n%s\n---\n\nwant=%#v\nhave=%#v", testcase.input, want, have)
}
}
}
@@ -107,3 +109,44 @@ func TestParseFeedWithBOM(t *testing.T) {
t.FailNow()
}
}
func TestParseCleanIllegalCharsInUTF8(t *testing.T) {
data := `
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>` + "\a" + `title</title>
</item>
</channel>
</rss>
`
feed, err := Parse(strings.NewReader(data))
if err != nil {
t.Fatal(err)
}
if len(feed.Items) != 1 || feed.Items[0].Title != "title" {
t.Fatalf("invalid feed, got: %v", feed)
}
}
func TestParseCleanIllegalCharsInNonUTF8(t *testing.T) {
// echo привет | iconv -f utf8 -t cp1251 | hexdump -C
data := `
<?xml version="1.0" encoding="windows-1251"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>` + "\a \xef\xf0\xe8\xe2\xe5\xf2\x0a \a" + `</title>
</item>
</channel>
</rss>
`
feed, err := Parse(strings.NewReader(data))
if err != nil {
t.Fatal(err)
}
if len(feed.Items) != 1 || feed.Items[0].Title != "привет" {
t.Fatalf("invalid feed, got: %v", feed)
}
}