mirror of
https://github.com/nkanaev/yarr.git
synced 2025-07-08 16:00:11 +00:00
handle invalid chars in non-utf8 xml
This commit is contained in:
parent
18221ef12d
commit
be7af0ccaf
@ -16,18 +16,20 @@ import (
|
|||||||
|
|
||||||
var UnknownFormat = errors.New("unknown feed format")
|
var UnknownFormat = errors.New("unknown feed format")
|
||||||
|
|
||||||
type processor func(r io.Reader) (*Feed, error)
|
// feedProbe is the result returned by sniff after inspecting the beginning
// of a feed document. The zero value means the format was not recognised.
type feedProbe struct {
|
||||||
|
// feedType is "rss", "rdf", "atom" or "json"; empty when sniff found no
// known root element.
feedType string
|
||||||
|
// callback is the parser matching feedType (ParseRSS, ParseRDF, ParseAtom
// or ParseJSON); nil when feedType is empty.
callback func(r io.Reader) (*Feed, error)
|
||||||
|
// encoding is the lower-cased encoding value taken from the
// <?xml ... encoding="..." ?> declaration; empty when none was found.
encoding string
|
||||||
|
}
|
||||||
|
|
||||||
func sniff(lookup string) (string, bool, processor) {
|
func sniff(lookup string) (out feedProbe) {
|
||||||
lookup = strings.TrimSpace(lookup)
|
lookup = strings.TrimSpace(lookup)
|
||||||
lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
|
lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF")
|
||||||
|
|
||||||
if len(lookup) == 0 {
|
if len(lookup) == 0 {
|
||||||
return "", false, nil
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var decode bool
|
|
||||||
|
|
||||||
switch lookup[0] {
|
switch lookup[0] {
|
||||||
case '<':
|
case '<':
|
||||||
decoder := xmlDecoder(strings.NewReader(lookup))
|
decoder := xmlDecoder(strings.NewReader(lookup))
|
||||||
@ -36,25 +38,35 @@ func sniff(lookup string) (string, bool, processor) {
|
|||||||
if token == nil {
|
if token == nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
// check for absence of xml encoding <?xml encoding="ENCODING" ?>
|
|
||||||
|
// check <?xml encoding="ENCODING" ?>
|
||||||
if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
|
if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" {
|
||||||
decode = strings.Index(string(el.Inst), "encoding=") == -1
|
out.encoding = strings.ToLower(procInst("encoding", string(el.Inst)))
|
||||||
}
|
}
|
||||||
|
|
||||||
if el, ok := token.(xml.StartElement); ok {
|
if el, ok := token.(xml.StartElement); ok {
|
||||||
switch el.Name.Local {
|
switch el.Name.Local {
|
||||||
case "rss":
|
case "rss":
|
||||||
return "rss", decode, ParseRSS
|
out.feedType = "rss"
|
||||||
|
out.callback = ParseRSS
|
||||||
|
return
|
||||||
case "RDF":
|
case "RDF":
|
||||||
return "rdf", decode, ParseRDF
|
out.feedType = "rdf"
|
||||||
|
out.callback = ParseRDF
|
||||||
|
return
|
||||||
case "feed":
|
case "feed":
|
||||||
return "atom", decode, ParseAtom
|
out.feedType = "atom"
|
||||||
|
out.callback = ParseAtom
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case '{':
|
case '{':
|
||||||
return "json", true, ParseJSON
|
out.feedType = "json"
|
||||||
|
out.callback = ParseJSON
|
||||||
|
return
|
||||||
}
|
}
|
||||||
return "", false, nil
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func Parse(r io.Reader) (*Feed, error) {
|
func Parse(r io.Reader) (*Feed, error) {
|
||||||
@ -74,19 +86,26 @@ func ParseWithEncoding(r io.Reader, fallbackEncoding string) (*Feed, error) {
|
|||||||
r = io.MultiReader(bytes.NewReader(lookup), r)
|
r = io.MultiReader(bytes.NewReader(lookup), r)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, decode, callback := sniff(string(lookup))
|
out := sniff(string(lookup))
|
||||||
if callback == nil {
|
if out.feedType == "" {
|
||||||
return nil, UnknownFormat
|
return nil, UnknownFormat
|
||||||
}
|
}
|
||||||
|
|
||||||
if decode && fallbackEncoding != "" {
|
if out.encoding == "" && fallbackEncoding != "" {
|
||||||
r, err = charset.NewReaderLabel(fallbackEncoding, r)
|
r, err = charset.NewReaderLabel(fallbackEncoding, r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
feed, err := callback(r)
|
if (out.feedType != "json") && (out.encoding == "" || out.encoding == "utf-8") {
|
||||||
|
// XML decoder will not rely on custom CharsetReader (see `xmlDecoder`)
|
||||||
|
// to handle invalid xml characters.
|
||||||
|
// Assume input is already UTF-8 and do the cleanup here.
|
||||||
|
r = NewSafeXMLReader(r)
|
||||||
|
}
|
||||||
|
|
||||||
|
feed, err := out.callback(r)
|
||||||
if feed != nil {
|
if feed != nil {
|
||||||
feed.cleanup()
|
feed.cleanup()
|
||||||
}
|
}
|
||||||
|
@ -7,38 +7,40 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestSniff(t *testing.T) {
|
// TestSniff feeds small documents to sniff and checks the detected feed type
// and declared encoding for RDF, RSS, Atom, JSON and a non-feed HTML page.
func TestSniff(t *testing.T) {
|
||||||
testcases := [][2]string{
|
// Each case pairs an input document with the feedProbe sniff should return.
testcases := []struct{
|
||||||
|
input string
|
||||||
|
want feedProbe
|
||||||
|
}{
|
||||||
{
|
{
|
||||||
`<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`,
|
`<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>`,
|
||||||
"rdf",
|
feedProbe{feedType: "rdf", callback: ParseRDF},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
`<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`,
|
`<?xml version="1.0" encoding="ISO-8859-1"?><rss version="2.0"><channel></channel></rss>`,
|
||||||
"rss",
|
feedProbe{feedType: "rss", callback: ParseRSS, encoding: "iso-8859-1"},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
`<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`,
|
`<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`,
|
||||||
"rss",
|
feedProbe{feedType: "rss", callback: ParseRSS},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
`<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`,
|
`<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`,
|
||||||
"atom",
|
feedProbe{feedType: "atom", callback: ParseAtom, encoding: "utf-8"},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
`{}`,
|
`{}`,
|
||||||
"json",
|
feedProbe{feedType: "json", callback: ParseJSON},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
`<!DOCTYPE html><html><head><title></title></head><body></body></html>`,
|
`<!DOCTYPE html><html><head><title></title></head><body></body></html>`,
|
||||||
"",
|
feedProbe{},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
for _, testcase := range testcases {
|
for _, testcase := range testcases {
|
||||||
have, _, _ := sniff(testcase[0])
|
want := testcase.want
|
||||||
want := testcase[1]
|
have := sniff(testcase.input)
|
||||||
if want != have {
|
// Only encoding and feedType are compared: the callback field is a func
// value, and Go func values cannot be compared with each other.
if want.encoding != have.encoding || want.feedType != have.feedType {
|
||||||
t.Log(testcase[0])
|
t.Errorf("Invalid output\n---\n%s\n---\n\nwant=%#v\nhave=%#v", testcase.input, want, have)
|
||||||
t.Errorf("Invalid format: want=%#v have=%#v", want, have)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -107,3 +109,44 @@ func TestParseFeedWithBOM(t *testing.T) {
|
|||||||
t.FailNow()
|
t.FailNow()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestParseCleanIllegalCharsInUTF8 parses an RSS document declared as UTF-8
// whose item title starts with a BEL ("\a") control character, and expects
// Parse to succeed and return the title without that character.
func TestParseCleanIllegalCharsInUTF8(t *testing.T) {
|
||||||
|
data := `
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||||
|
<channel>
|
||||||
|
<item>
|
||||||
|
<title>` + "\a" + `title</title>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>
|
||||||
|
`
|
||||||
|
feed, err := Parse(strings.NewReader(data))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// Exactly one item is expected, with the BEL character gone from its title.
if len(feed.Items) != 1 || feed.Items[0].Title != "title" {
|
||||||
|
t.Fatalf("invalid feed, got: %v", feed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestParseCleanIllegalCharsInNonUTF8 parses an RSS document declared as
// windows-1251 whose item title holds cp1251-encoded text surrounded by BEL
// ("\a") characters and whitespace, and expects Parse to return the decoded
// text only.
func TestParseCleanIllegalCharsInNonUTF8(t *testing.T) {
|
||||||
|
// The title bytes below were produced with:
// echo привет | iconv -f utf8 -t cp1251 | hexdump -C
|
||||||
|
data := `
|
||||||
|
<?xml version="1.0" encoding="windows-1251"?>
|
||||||
|
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||||
|
<channel>
|
||||||
|
<item>
|
||||||
|
<title>` + "\a \xef\xf0\xe8\xe2\xe5\xf2\x0a \a" + `</title>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>
|
||||||
|
`
|
||||||
|
feed, err := Parse(strings.NewReader(data))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// The expected title has neither the BEL characters nor the surrounding
// whitespace. NOTE(review): where the whitespace is trimmed is not visible
// in this test — presumably during parsing or feed cleanup; confirm.
if len(feed.Items) != 1 || feed.Items[0].Title != "привет" {
|
||||||
|
t.Fatalf("invalid feed, got: %v", feed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -30,9 +30,15 @@ func plain2html(text string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func xmlDecoder(r io.Reader) *xml.Decoder {
|
// xmlDecoder returns a non-strict xml.Decoder reading from r. When the
// decoder asks for a charset conversion through CharsetReader, the input is
// transcoded with charset.NewReaderLabel and the transcoded stream is then
// wrapped in NewSafeXMLReader.
func xmlDecoder(r io.Reader) *xml.Decoder {
|
||||||
decoder := xml.NewDecoder(NewSafeXMLReader(r))
|
// r is handed to the decoder unwrapped; NewSafeXMLReader is applied inside
// CharsetReader below, i.e. after any charset conversion.
decoder := xml.NewDecoder(r)
|
||||||
decoder.Strict = false
|
decoder.Strict = false
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = func(cs string, input io.Reader) (io.Reader, error) {
|
||||||
|
// NOTE: this r shadows the function parameter inside the closure.
r, err := charset.NewReaderLabel(cs, input)
|
||||||
|
if err == nil {
|
||||||
|
// Wrap only on success so the error from NewReaderLabel is returned as is.
r = NewSafeXMLReader(r)
|
||||||
|
}
|
||||||
|
return r, err
|
||||||
|
}
|
||||||
return decoder
|
return decoder
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,3 +85,28 @@ func isInCharacterRange(r rune) (inrange bool) {
|
|||||||
r >= 0xE000 && r <= 0xFFFD ||
|
r >= 0xE000 && r <= 0xFFFD ||
|
||||||
r >= 0x10000 && r <= 0x10FFFF
|
r >= 0x10000 && r <= 0x10FFFF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: adapted from the "encoding/xml" package.
// procInst parses the `param="..."` or `param='...'`
// value out of the provided string, returning "" if not found.
//
// s is scanned left to right for `param=`. An occurrence that is not
// immediately followed by a quote character (for example `encoding=` found
// inside `xencoding=1`) is skipped and the search continues, instead of
// giving up on the first textual match. The value runs up to the next
// occurrence of the same quote character; "" is returned when that closing
// quote is missing.
//
// The matching is purely textual: it does not require `param` to start at a
// word boundary and it does not accept whitespace around the `=`.
func procInst(param, s string) string {
	key := param + "="
	rest := s
	for {
		idx := strings.Index(rest, key)
		if idx == -1 {
			return ""
		}
		// Drop everything up to and including `param=`; what remains is both
		// the candidate value and the haystack for the next attempt.
		rest = rest[idx+len(key):]
		if rest == "" {
			return ""
		}
		quote := rest[0]
		if quote != '\'' && quote != '"' {
			// Unquoted text after `param=` is not a value; keep looking.
			continue
		}
		end := strings.IndexByte(rest[1:], quote)
		if end == -1 {
			return ""
		}
		return rest[1 : end+1]
	}
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user