mirror of
https://github.com/nkanaev/yarr.git
synced 2025-09-14 10:20:06 +00:00
handle invalid chars in non-utf8 xml
This commit is contained in:
@@ -7,38 +7,40 @@ import (
|
||||
)
|
||||
|
||||
func TestSniff(t *testing.T) {
|
||||
testcases := [][2]string{
|
||||
testcases := []struct{
|
||||
input string
|
||||
want feedProbe
|
||||
}{
|
||||
{
|
||||
`<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`,
|
||||
"rdf",
|
||||
feedProbe{feedType: "rdf", callback: ParseRDF},
|
||||
},
|
||||
{
|
||||
`<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`,
|
||||
"rss",
|
||||
feedProbe{feedType: "rss", callback: ParseRSS, encoding: "iso-8859-1"},
|
||||
},
|
||||
{
|
||||
`<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`,
|
||||
"rss",
|
||||
feedProbe{feedType: "rss", callback: ParseRSS},
|
||||
},
|
||||
{
|
||||
`<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`,
|
||||
"atom",
|
||||
feedProbe{feedType: "atom", callback: ParseAtom, encoding: "utf-8"},
|
||||
},
|
||||
{
|
||||
`{}`,
|
||||
"json",
|
||||
feedProbe{feedType: "json", callback: ParseJSON},
|
||||
},
|
||||
{
|
||||
`<!DOCTYPE html><html><head><title></title></head><body></body></html>`,
|
||||
"",
|
||||
feedProbe{},
|
||||
},
|
||||
}
|
||||
for _, testcase := range testcases {
|
||||
have, _, _ := sniff(testcase[0])
|
||||
want := testcase[1]
|
||||
if want != have {
|
||||
t.Log(testcase[0])
|
||||
t.Errorf("Invalid format: want=%#v have=%#v", want, have)
|
||||
want := testcase.want
|
||||
have := sniff(testcase.input)
|
||||
if want.encoding != have.encoding || want.feedType != have.feedType {
|
||||
t.Errorf("Invalid output\n---\n%s\n---\n\nwant=%#v\nhave=%#v", testcase.input, want, have)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -107,3 +109,44 @@ func TestParseFeedWithBOM(t *testing.T) {
|
||||
t.FailNow()
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCleanIllegalCharsInUTF8(t *testing.T) {
|
||||
data := `
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<item>
|
||||
<title>` + "\a" + `title</title>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
`
|
||||
feed, err := Parse(strings.NewReader(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(feed.Items) != 1 || feed.Items[0].Title != "title" {
|
||||
t.Fatalf("invalid feed, got: %v", feed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCleanIllegalCharsInNonUTF8(t *testing.T) {
|
||||
// echo привет | iconv -f utf8 -t cp1251 | hexdump -C
|
||||
data := `
|
||||
<?xml version="1.0" encoding="windows-1251"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel>
|
||||
<item>
|
||||
<title>` + "\a \xef\xf0\xe8\xe2\xe5\xf2\x0a \a" + `</title>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
`
|
||||
feed, err := Parse(strings.NewReader(data))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(feed.Items) != 1 || feed.Items[0].Title != "привет" {
|
||||
t.Fatalf("invalid feed, got: %v", feed)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user