mirror of
https://github.com/nkanaev/yarr.git
synced 2025-05-24 00:33:14 +00:00
strip out invalid xml characters
This commit is contained in:
parent
2de3ddff08
commit
d7253a60b8
@ -1,10 +1,12 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
@ -28,8 +30,68 @@ func plain2html(text string) string {
|
||||
}
|
||||
|
||||
func xmlDecoder(r io.Reader) *xml.Decoder {
|
||||
decoder := xml.NewDecoder(r)
|
||||
decoder := xml.NewDecoder(NewSafeXMLReader(r))
|
||||
decoder.Strict = false
|
||||
decoder.CharsetReader = charset.NewReaderLabel
|
||||
return decoder
|
||||
}
|
||||
|
||||
// safexmlreader is an io.Reader that decodes its source rune by rune and
// forwards only the runes accepted by isInCharacterRange.
type safexmlreader struct {
	reader  *bufio.Reader // source, consumed one rune at a time
	buffer  []byte        // filtered bytes not yet handed to the caller
	isEOF   bool          // set once the source has reported io.EOF
	runebuf []byte        // scratch space for encoding a single rune
}

// NewSafeXMLReader wraps r so that every rune outside the XML character range
// is dropped from the stream.
//
// NOTE: the source is decoded with bufio.Reader.ReadRune, which reports an
// invalid UTF-8 byte as U+FFFD; that rune is inside the accepted range, so
// such bytes come out as the replacement character rather than being removed.
func NewSafeXMLReader(r io.Reader) io.Reader {
	return &safexmlreader{
		reader:  bufio.NewReader(r),
		runebuf: make([]byte, utf8.UTFMax),
	}
}

// Read fills p with filtered bytes and returns how many were written.
//
// It keeps pulling runes from the source until at least len(p) filtered bytes
// are available, the source is exhausted, or the source fails. A multi-byte
// rune may straddle two calls; the remainder stays buffered for the next Read.
// io.EOF is returned only once the source is exhausted and nothing is left in
// the buffer. When the source fails, the bytes filtered so far are returned
// together with the error instead of being withheld.
func (xr *safexmlreader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}

	// The fill target is len(p), not cap(p): only len(p) bytes can be copied
	// out, so reporting more would drop data and break the io.Reader contract.
	var readErr error
	for !xr.isEOF && len(xr.buffer) < len(p) {
		r, _, err := xr.reader.ReadRune()
		if err == io.EOF {
			xr.isEOF = true
			break
		}
		if err != nil {
			readErr = err
			break
		}
		if isInCharacterRange(r) {
			size := utf8.EncodeRune(xr.runebuf, r)
			xr.buffer = append(xr.buffer, xr.runebuf[:size]...)
		}
	}

	n := copy(p, xr.buffer)
	xr.buffer = xr.buffer[n:]

	if readErr != nil {
		// The loop only runs while the buffer is shorter than p, so everything
		// buffered has just been delivered alongside the error.
		return n, readErr
	}
	if n == 0 && xr.isEOF {
		return 0, io.EOF
	}
	return n, nil
}

// NOTE: copied from "encoding/xml" package
// Decide whether the given rune is in the XML Character Range, per
// the Char production of https://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
|
||||
|
88
src/parser/util_test.go
Normal file
88
src/parser/util_test.go
Normal file
@ -0,0 +1,88 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSafeXMLReader(t *testing.T) {
|
||||
var f io.Reader
|
||||
want := []byte("привет мир")
|
||||
f = bytes.NewReader(want)
|
||||
f = NewSafeXMLReader(f)
|
||||
|
||||
have, err := io.ReadAll(f)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !reflect.DeepEqual(want, have) {
|
||||
t.Fatalf("invalid output\nwant: %v\nhave: %v", want, have)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSafeXMLReaderRemoveUnwantedRunes(t *testing.T) {
|
||||
var f io.Reader
|
||||
input := []byte("\aпривет \x0cмир\ufffe\uffff")
|
||||
want := []byte("привет мир")
|
||||
f = bytes.NewReader(input)
|
||||
f = NewSafeXMLReader(f)
|
||||
|
||||
have, err := io.ReadAll(f)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !reflect.DeepEqual(want, have) {
|
||||
t.Fatalf("invalid output\nwant: %v\nhave: %v", want, have)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSafeXMLReaderPartial1(t *testing.T) {
|
||||
var f io.Reader
|
||||
input := []byte("\aпривет \x0cмир\ufffe\uffff")
|
||||
want := []byte("привет мир")
|
||||
f = bytes.NewReader(input)
|
||||
f = NewSafeXMLReader(f)
|
||||
|
||||
buf := make([]byte, 1)
|
||||
for i := 0; i < len(want); i++ {
|
||||
n, err := f.Read(buf)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if n != 1 {
|
||||
t.Fatalf("expected 1 byte, got %d", n)
|
||||
}
|
||||
if buf[0] != want[i] {
|
||||
t.Fatalf("invalid char at pos %d\nwant: %v\nhave: %v", i, want[i], buf[0])
|
||||
}
|
||||
}
|
||||
if x, err := f.Read(buf); err != io.EOF {
|
||||
t.Fatalf("expected EOF, %v, %v %v", buf, x, err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSafeXMLReaderPartial2(t *testing.T) {
|
||||
var f io.Reader
|
||||
input := []byte("привет\a\a\a\a\a")
|
||||
f = bytes.NewReader(input)
|
||||
f = NewSafeXMLReader(f)
|
||||
|
||||
buf := make([]byte, 12)
|
||||
n, err := f.Read(buf)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
if n != 12 {
|
||||
t.Fatalf("expected 12 bytes")
|
||||
}
|
||||
|
||||
n, err = f.Read(buf)
|
||||
if n != 0 {
|
||||
t.Fatalf("expected 0")
|
||||
}
|
||||
if err != io.EOF {
|
||||
t.Fatalf("expected EOF, got %v", err)
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user