From be7af0ccafbdc5b9b1423e21628553d16c42c892 Mon Sep 17 00:00:00 2001 From: Nazar Kanaev Date: Mon, 14 Feb 2022 15:23:55 +0000 Subject: [PATCH] handle invalid chars in non-utf8 xml --- src/parser/feed.go | 51 +++++++++++++++++++++---------- src/parser/feed_test.go | 67 +++++++++++++++++++++++++++++++++-------- src/parser/util.go | 35 +++++++++++++++++++-- 3 files changed, 123 insertions(+), 30 deletions(-) diff --git a/src/parser/feed.go b/src/parser/feed.go index 43c7900..fe07161 100644 --- a/src/parser/feed.go +++ b/src/parser/feed.go @@ -16,18 +16,20 @@ import ( var UnknownFormat = errors.New("unknown feed format") -type processor func(r io.Reader) (*Feed, error) +type feedProbe struct { + feedType string + callback func(r io.Reader) (*Feed, error) + encoding string +} -func sniff(lookup string) (string, bool, processor) { +func sniff(lookup string) (out feedProbe) { lookup = strings.TrimSpace(lookup) lookup = strings.TrimLeft(lookup, "\x00\xEF\xBB\xBF\xFE\xFF") if len(lookup) == 0 { - return "", false, nil + return } - var decode bool - switch lookup[0] { case '<': decoder := xmlDecoder(strings.NewReader(lookup)) @@ -36,25 +38,35 @@ func sniff(lookup string) (string, bool, processor) { if token == nil { break } - // check for absence of xml encoding + + // check if el, ok := token.(xml.ProcInst); ok && el.Target == "xml" { - decode = strings.Index(string(el.Inst), "encoding=") == -1 + out.encoding = strings.ToLower(procInst("encoding", string(el.Inst))) } + if el, ok := token.(xml.StartElement); ok { switch el.Name.Local { case "rss": - return "rss", decode, ParseRSS + out.feedType = "rss" + out.callback = ParseRSS + return case "RDF": - return "rdf", decode, ParseRDF + out.feedType = "rdf" + out.callback = ParseRDF + return case "feed": - return "atom", decode, ParseAtom + out.feedType = "atom" + out.callback = ParseAtom + return } } } case '{': - return "json", true, ParseJSON + out.feedType = "json" + out.callback = ParseJSON + return } - return "", false, nil + return } func Parse(r io.Reader) (*Feed, error) { @@ -74,19 +86,26 @@ func ParseWithEncoding(r io.Reader, fallbackEncoding string) (*Feed, error) { r = io.MultiReader(bytes.NewReader(lookup), r) } - _, decode, callback := sniff(string(lookup)) - if callback == nil { + out := sniff(string(lookup)) + if out.feedType == "" { return nil, UnknownFormat } - if decode && fallbackEncoding != "" { + if out.encoding == "" && fallbackEncoding != "" { r, err = charset.NewReaderLabel(fallbackEncoding, r) if err != nil { return nil, err } } - feed, err := callback(r) + if (out.feedType != "json") && (out.encoding == "" || out.encoding == "utf-8") { + // XML decoder will not rely on custom CharsetReader (see `xmlDecoder`) + // to handle invalid xml characters. + // Assume input is already UTF-8 and do the cleanup here. + r = NewSafeXMLReader(r) + } + + feed, err := out.callback(r) if feed != nil { feed.cleanup() } diff --git a/src/parser/feed_test.go b/src/parser/feed_test.go index a28442f..6c6a058 100644 --- a/src/parser/feed_test.go +++ b/src/parser/feed_test.go @@ -7,38 +7,40 @@ import ( ) func TestSniff(t *testing.T) { - testcases := [][2]string{ + testcases := []struct{ + input string + want feedProbe + }{ { ``, - "rdf", + feedProbe{feedType: "rdf", callback: ParseRDF}, }, { ``, - "rss", + feedProbe{feedType: "rss", callback: ParseRSS, encoding: "iso-8859-1"}, }, { ``, - "rss", + feedProbe{feedType: "rss", callback: ParseRSS}, }, { ``, - "atom", + feedProbe{feedType: "atom", callback: ParseAtom, encoding: "utf-8"}, }, { `{}`, - "json", + feedProbe{feedType: "json", callback: ParseJSON}, }, { ``, - "", + feedProbe{}, }, } for _, testcase := range testcases { - have, _, _ := sniff(testcase[0]) - want := testcase[1] - if want != have { - t.Log(testcase[0]) - t.Errorf("Invalid format: want=%#v have=%#v", want, have) + want := testcase.want + have := sniff(testcase.input) + if want.encoding != have.encoding || want.feedType != have.feedType { + t.Errorf("Invalid output\n---\n%s\n---\n\nwant=%#v\nhave=%#v", testcase.input, want, have) } } } @@ -107,3 +109,44 @@ func TestParseFeedWithBOM(t *testing.T) { t.FailNow() } } + +func TestParseCleanIllegalCharsInUTF8(t *testing.T) { + data := ` + + + + + ` + "\a" + `title + + + + ` + feed, err := Parse(strings.NewReader(data)) + if err != nil { + t.Fatal(err) + } + if len(feed.Items) != 1 || feed.Items[0].Title != "title" { + t.Fatalf("invalid feed, got: %v", feed) + } +} + +func TestParseCleanIllegalCharsInNonUTF8(t *testing.T) { + // echo привет | iconv -f utf8 -t cp1251 | hexdump -C + data := ` + + + + + ` + "\a \xef\xf0\xe8\xe2\xe5\xf2\x0a \a" + ` + + + + ` + feed, err := Parse(strings.NewReader(data)) + if err != nil { + t.Fatal(err) + } + if len(feed.Items) != 1 || feed.Items[0].Title != "привет" { + t.Fatalf("invalid feed, got: %v", feed) + } +} diff --git a/src/parser/util.go b/src/parser/util.go index 286bef8..ca1aa38 100644 --- a/src/parser/util.go +++ b/src/parser/util.go @@ -30,9 +30,15 @@ func plain2html(text string) string { } func xmlDecoder(r io.Reader) *xml.Decoder { - decoder := xml.NewDecoder(NewSafeXMLReader(r)) + decoder := xml.NewDecoder(r) decoder.Strict = false - decoder.CharsetReader = charset.NewReaderLabel + decoder.CharsetReader = func(cs string, input io.Reader) (io.Reader, error) { + r, err := charset.NewReaderLabel(cs, input) + if err == nil { + r = NewSafeXMLReader(r) + } + return r, err + } return decoder } @@ -79,3 +85,28 @@ func isInCharacterRange(r rune) (inrange bool) { r >= 0xE000 && r <= 0xFFFD || r >= 0x10000 && r <= 0x10FFFF } + +// NOTE: copied from "encoding/xml" package +// procInst parses the `param="..."` or `param='...'` +// value out of the provided string, returning "" if not found. +func procInst(param, s string) string { + // TODO: this parsing is somewhat lame and not exact. + // It works for all actual cases, though. + param = param + "=" + idx := strings.Index(s, param) + if idx == -1 { + return "" + } + v := s[idx+len(param):] + if v == "" { + return "" + } + if v[0] != '\'' && v[0] != '"' { + return "" + } + idx = strings.IndexRune(v[1:], rune(v[0])) + if idx == -1 { + return "" + } + return v[1 : idx+1] +}