handle invalid chars in non-utf8 xml

This commit is contained in:
Nazar Kanaev
2022-02-14 15:23:55 +00:00
parent 18221ef12d
commit be7af0ccaf
3 changed files with 123 additions and 30 deletions

View File

@@ -30,9 +30,15 @@ func plain2html(text string) string {
}
// xmlDecoder builds an *xml.Decoder reading from r, with strict parsing
// disabled and a CharsetReader installed (encoding/xml consults CharsetReader
// when the document declares a non-UTF-8 encoding).
//
// NOTE(review): this span is a diff hunk shown without +/- markers, so the two
// `decoder :=` lines and the two CharsetReader assignments are presumably the
// before/after forms of the same statement — confirm against the commit.
func xmlDecoder(r io.Reader) *xml.Decoder {
decoder := xml.NewDecoder(NewSafeXMLReader(r))
decoder := xml.NewDecoder(r)
// Non-strict mode: let the decoder tolerate malformed input instead of failing.
decoder.Strict = false
decoder.CharsetReader = charset.NewReaderLabel
decoder.CharsetReader = func(cs string, input io.Reader) (io.Reader, error) {
// This r is local to the closure and shadows the outer r parameter.
r, err := charset.NewReaderLabel(cs, input)
// Wrap the converted stream in NewSafeXMLReader only when the charset
// lookup succeeded; on failure r and err are returned as received.
// Presumably NewSafeXMLReader filters characters that are invalid in XML —
// verify against its definition.
if err == nil {
r = NewSafeXMLReader(r)
}
return r, err
}
return decoder
}
@@ -79,3 +85,28 @@ func isInCharacterRange(r rune) (inrange bool) {
r >= 0xE000 && r <= 0xFFFD ||
r >= 0x10000 && r <= 0x10FFFF
}
// NOTE: adapted from the "encoding/xml" package.
//
// procInst parses the `param="..."` or `param='...'` value out of the
// provided string (typically the body of an `<?xml ...?>` processing
// instruction), returning "" if not found.
//
// Occurrences of `param=` that are not immediately followed by a quote are
// skipped and the search continues, so an earlier malformed or unquoted
// occurrence (e.g. `version=2.0 version="1.0"`) does not hide a later
// well-formed one. An opening quote without a matching closing quote yields "".
func procInst(param, s string) string {
	// TODO: this parsing is somewhat lame and not exact (it does not check
	// attribute-name boundaries or nesting inside other quoted values).
	// It works for all actual cases, though.
	param += "="
	rest := s
	for {
		idx := strings.Index(rest, param)
		if idx == -1 {
			return ""
		}
		// v is everything after "param=".
		v := rest[idx+len(param):]
		if v == "" {
			return ""
		}
		if quote := v[0]; quote == '\'' || quote == '"' {
			// The delimiter is a single ASCII byte, so a byte search suffices.
			end := strings.IndexByte(v[1:], quote)
			if end == -1 {
				return ""
			}
			return v[1 : end+1]
		}
		// Unquoted value: resume scanning after this occurrence. v is strictly
		// shorter than rest, so the loop always terminates.
		rest = v
	}
}