Pull atom xhtml title from nested elements

The Atom spec says that the content of any title marked with a type of
"xhtml" must be a single div element[1], so we need to use the full XML
text of the title when extracting its text.

[1] https://www.rfc-editor.org/rfc/rfc4287#section-3.1
This commit is contained in:
Will Harding 2023-09-17 15:11:02 -07:00 committed by nkanaev
parent c76ff26bd6
commit 3adcddc70c
2 changed files with 40 additions and 0 deletions

View File

@ -47,6 +47,8 @@ type atomLinks []atomLink
func (a *atomText) Text() string { func (a *atomText) Text() string {
if a.Type == "html" { if a.Type == "html" {
return htmlutil.ExtractText(a.Data) return htmlutil.ExtractText(a.Data)
} else if a.Type == "xhtml" {
return htmlutil.ExtractText(a.XML)
} }
return a.Data return a.Data
} }

View File

@ -94,6 +94,44 @@ func TestAtomHTMLTitle(t *testing.T) {
} }
} }
// TestAtomXHTMLTitle parses an Atom feed whose entry title is declared with
// type="xhtml" and contains entity-escaped markup, and expects the item
// title to come back as "say what?".
func TestAtomXHTMLTitle(t *testing.T) {
	// NOTE(review): the <title> element in this fixture is never closed
	// before </entry>; presumably the parser tolerates that — confirm it is
	// intentional before tidying the fixture.
	feed, err := Parse(strings.NewReader(`
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry><title type="xhtml">say &lt;code&gt;what&lt;/code&gt;?</entry>
</feed>
`))
	// Fail with a readable message instead of panicking further down when
	// the feed could not be parsed.
	if err != nil {
		t.Fatalf("Parse returned an error: %v", err)
	}
	// Guard the index below so an empty result is a test failure, not an
	// index-out-of-range panic.
	if len(feed.Items) == 0 {
		t.Fatal("Parse returned a feed without items")
	}
	have := feed.Items[0].Title
	want := "say what?"
	if !reflect.DeepEqual(want, have) {
		t.Fatalf("want: %#v\nhave: %#v", want, have)
	}
}
// TestAtomXHTMLNestedTitle parses an Atom feed whose entry title is declared
// with type="xhtml" and wraps its text in a div and an anchor element, and
// expects the item title to come back as "Link to Example".
func TestAtomXHTMLNestedTitle(t *testing.T) {
	feed, err := Parse(strings.NewReader(`
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<title type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<a href="https://example.com">Link to Example</a>
</div>
</title>
</entry>
</feed>
`))
	// Fail with a readable message instead of panicking further down when
	// the feed could not be parsed.
	if err != nil {
		t.Fatalf("Parse returned an error: %v", err)
	}
	// Guard the index below so an empty result is a test failure, not an
	// index-out-of-range panic.
	if len(feed.Items) == 0 {
		t.Fatal("Parse returned a feed without items")
	}
	have := feed.Items[0].Title
	want := "Link to Example"
	if !reflect.DeepEqual(want, have) {
		t.Fatalf("want: %#v\nhave: %#v", want, have)
	}
}
func TestAtomImageLink(t *testing.T) { func TestAtomImageLink(t *testing.T) {
feed, _ := Parse(strings.NewReader(` feed, _ := Parse(strings.NewReader(`
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>