handle google url redirect in page crawler

This commit is contained in:
Nazar Kanaev 2022-08-21 13:31:03 +01:00
parent b935a1c511
commit 698f5d6d06
4 changed files with 45 additions and 1 deletions

View File

@ -105,7 +105,7 @@
return api('post', './logout') return api('post', './logout')
}, },
crawl: function(url) { crawl: function(url) {
return api('get', './page?url=' + url).then(json) return api('get', './page?url=' + encodeURIComponent(url)).then(json)
} }
} }
})() })()

17
src/content/silo/url.go Normal file
View File

@ -0,0 +1,17 @@
package silo
import (
"net/url"
"strings"
)
// RedirectURL unwraps a Google redirect link.
//
// When link starts with "https://www.google.com/url?" and carries a non-empty
// "url" query parameter, the decoded value of that parameter is returned.
// In every other case (different prefix, unparsable link, missing or empty
// parameter) link is returned unchanged.
func RedirectURL(link string) string {
	const googleRedirectPrefix = "https://www.google.com/url?"

	if !strings.HasPrefix(link, googleRedirectPrefix) {
		return link
	}

	parsed, err := url.Parse(link)
	if err != nil {
		return link
	}

	target := parsed.Query().Get("url")
	if target == "" {
		return link
	}
	return target
}

View File

@ -0,0 +1,24 @@
package silo
import "testing"
// TestRedirectURL checks that RedirectURL unwraps Google redirect links and
// returns every other link unchanged.
func TestRedirectURL(t *testing.T) {
	const target = "https://www.cryptoglobe.com/latest/2022/08/investment-strategist-lyn-alden-explains-why-she-is-still-bullish-on-bitcoin-long-term/"

	tests := []struct {
		name string
		link string
		want string
	}{
		{
			name: "google redirect is unwrapped",
			link: "https://www.google.com/url?rct=j&sa=t&url=" + target + "&ct=ga&cd=CAIyGjlkMjI1NjUyODE3ODFjMDQ6Y29tOmVuOlVT&usg=AOvVaw16C2fJtw6m8QVEbto2HCKK",
			want: target,
		},
		{
			name: "plain link is untouched",
			link: "https://example.com",
			want: "https://example.com",
		},
		{
			// A "url" parameter alone must not trigger unwrapping;
			// only the Google redirect prefix does.
			name: "non-google host with url parameter is untouched",
			link: "https://example.com/url?url=test.com",
			want: "https://example.com/url?url=test.com",
		},
		{
			name: "google redirect without url parameter is untouched",
			link: "https://www.google.com/url?sa=t",
			want: "https://www.google.com/url?sa=t",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			if have := RedirectURL(tc.link); have != tc.want {
				t.Errorf("RedirectURL(%q)\nwant: %s\nhave: %s", tc.link, tc.want, have)
			}
		})
	}
}

View File

@ -456,6 +456,9 @@ func (s *Server) handleOPMLExport(c *router.Context) {
func (s *Server) handlePageCrawl(c *router.Context) { func (s *Server) handlePageCrawl(c *router.Context) {
url := c.Req.URL.Query().Get("url") url := c.Req.URL.Query().Get("url")
if newUrl := silo.RedirectURL(url); newUrl != "" {
url = newUrl
}
if content := silo.VideoIFrame(url); content != "" { if content := silo.VideoIFrame(url); content != "" {
c.JSON(http.StatusOK, map[string]string{ c.JSON(http.StatusOK, map[string]string{
"content": sanitizer.Sanitize(url, content), "content": sanitizer.Sanitize(url, content),