	refactor crawler
This commit extracts the HTTP client into a new file, src/worker/client.go, and reworks DiscoverFeed to return a single DiscoverResult (a parsed feed, its link, or a list of alternative sources) instead of a four-value return. listItems is simplified around the conditional-GET path, and previously commented-out code is dropped.
		| @@ -155,35 +155,32 @@ func (s *Server) handleFeedList(c *router.Context) { | |||||||
| 			return | 			return | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		feed, feedUrl, sources, err := worker.DiscoverFeed(form.Url) | 		result, err := worker.DiscoverFeed(form.Url) | ||||||
| 		if err != nil { | 		switch { | ||||||
| 			log.Print(err) | 		case err != nil: | ||||||
|  | 			log.Printf("Failed to discover feed for %s: %s", form.Url, err) | ||||||
| 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) | 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) | ||||||
| 			return | 		case len(result.Sources) > 0: | ||||||
| 		} | 			c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": result.Sources}) | ||||||
|  | 		case result.Feed != nil: | ||||||
| 		if feed != nil { | 			feed := s.db.CreateFeed( | ||||||
| 			storedFeed := s.db.CreateFeed( | 				result.Feed.Title, | ||||||
| 				feed.Title, |  | ||||||
| 				"", | 				"", | ||||||
| 				feed.SiteURL, | 				result.Feed.SiteURL, | ||||||
| 				feedUrl, | 				result.FeedLink, | ||||||
| 				form.FolderID, | 				form.FolderID, | ||||||
| 			) | 			) | ||||||
| 			s.db.CreateItems(worker.ConvertItems(feed.Items, *storedFeed)) | 			s.db.CreateItems(worker.ConvertItems(result.Feed.Items, *feed)) | ||||||
|  |  | ||||||
| 			icon, err := worker.FindFavicon(storedFeed.Link, storedFeed.FeedLink) | 			icon, err := worker.FindFavicon(feed.Link, feed.FeedLink) | ||||||
| 			if icon != nil { | 			if icon != nil { | ||||||
| 				s.db.UpdateFeedIcon(storedFeed.Id, icon) | 				s.db.UpdateFeedIcon(feed.Id, icon) | ||||||
| 			} | 			} | ||||||
| 			if err != nil { | 			if err != nil { | ||||||
| 				log.Printf("Failed to find favicon for %s (%d): %s", storedFeed.FeedLink, storedFeed.Id, err) | 				log.Printf("Failed to find favicon for %s (%d): %s", feed.FeedLink, feed.Id, err) | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 			c.JSON(http.StatusOK, map[string]string{"status": "success"}) | 			c.JSON(http.StatusOK, map[string]string{"status": "success"}) | ||||||
| 		} else if sources != nil { | 		default: | ||||||
| 			c.JSON(http.StatusOK, map[string]interface{}{"status": "multiple", "choice": sources}) |  | ||||||
| 		} else { |  | ||||||
| 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) | 			c.JSON(http.StatusOK, map[string]string{"status": "notfound"}) | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|   | |||||||
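For reference, the handler now produces exactly one of three JSON payloads, which follow directly from the switch above. A minimal runnable sketch (the example FeedSource values are hypothetical; the statuses and field names are taken from the code):

	package main

	import (
		"encoding/json"
		"fmt"
	)

	// FeedSource mirrors worker.FeedSource from the diff below.
	type FeedSource struct {
		Title string `json:"title"`
		Url   string `json:"url"`
	}

	func main() {
		// The three response shapes handleFeedList can return.
		payloads := []interface{}{
			map[string]string{"status": "notfound"}, // fetch or parse failed
			map[string]interface{}{"status": "multiple",
				"choice": []FeedSource{{Title: "Example Feed", Url: "https://example.com/feed.xml"}}},
			map[string]string{"status": "success"}, // feed created, items stored
		}
		for _, p := range payloads {
			b, _ := json.Marshal(p)
			fmt.Println(string(b))
		}
	}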
							
								
								
									
src/worker/client.go (new file, 52 lines)
							| @@ -0,0 +1,52 @@ | |||||||
|  | package worker | ||||||
|  |  | ||||||
|  | import ( | ||||||
|  | 	"net" | ||||||
|  | 	"net/http" | ||||||
|  | 	"time" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | type Client struct { | ||||||
|  | 	httpClient *http.Client | ||||||
|  | 	userAgent  string | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (c *Client) get(url string) (*http.Response, error) { | ||||||
|  | 	return c.getConditional(url, "", "") | ||||||
|  | } | ||||||
|  |  | ||||||
|  | func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) { | ||||||
|  | 	req, err := http.NewRequest("GET", url, nil) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return nil, err | ||||||
|  | 	} | ||||||
|  | 	req.Header.Set("User-Agent", c.userAgent) | ||||||
|  | 	if lastModified != "" { | ||||||
|  | 		req.Header.Set("If-Modified-Since", lastModified) | ||||||
|  | 	} | ||||||
|  | 	if etag != "" { | ||||||
|  | 		req.Header.Set("If-None-Match", etag) | ||||||
|  | 	} | ||||||
|  | 	return c.httpClient.Do(req) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | var client *Client | ||||||
|  |  | ||||||
|  | func init() { | ||||||
|  | 	transport := &http.Transport{ | ||||||
|  | 		Proxy: http.ProxyFromEnvironment, | ||||||
|  | 		DialContext: (&net.Dialer{ | ||||||
|  | 			Timeout: 10 * time.Second, | ||||||
|  | 		}).DialContext, | ||||||
|  | 		DisableKeepAlives:   true, | ||||||
|  | 		TLSHandshakeTimeout: time.Second * 10, | ||||||
|  | 	} | ||||||
|  | 	httpClient := &http.Client{ | ||||||
|  | 		Timeout:   time.Second * 30, | ||||||
|  | 		Transport: transport, | ||||||
|  | 	} | ||||||
|  | 	client = &Client{ | ||||||
|  | 		httpClient: httpClient, | ||||||
|  | 		userAgent:  "Yarr/1.0", | ||||||
|  | 	} | ||||||
|  | } | ||||||
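The new shared client funnels every request through one place: a single User-Agent, proxy settings from the environment, and connect/TLS/overall timeouts. A minimal usage sketch (refreshFeed is a hypothetical caller; client and getConditional are as defined above, and http is the net/http import already used in this file):

	// refreshFeed shows the intended conditional-GET flow. Passing the
	// validators saved from a previous response lets the server answer
	// 304 Not Modified instead of resending the whole feed.
	func refreshFeed(url, lastModified, etag string) error {
		res, err := client.getConditional(url, lastModified, etag)
		if err != nil {
			return err
		}
		defer res.Body.Close()
		if res.StatusCode == http.StatusNotModified {
			return nil // feed unchanged since the last fetch; nothing to do
		}
		// ... read and parse res.Body, as listItems does in the crawler ...
		return nil
	}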
| @@ -5,11 +5,8 @@ import ( | |||||||
| 	"errors" | 	"errors" | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"io/ioutil" | 	"io/ioutil" | ||||||
| 	"log" |  | ||||||
| 	"net" |  | ||||||
| 	"net/http" | 	"net/http" | ||||||
| 	"net/url" | 	"net/url" | ||||||
| 	"time" |  | ||||||
|  |  | ||||||
| 	"github.com/nkanaev/yarr/src/scraper" | 	"github.com/nkanaev/yarr/src/scraper" | ||||||
| 	"github.com/nkanaev/yarr/src/parser" | 	"github.com/nkanaev/yarr/src/parser" | ||||||
| @@ -22,102 +19,54 @@ type FeedSource struct { | |||||||
| 	Url   string `json:"url"` | 	Url   string `json:"url"` | ||||||
| } | } | ||||||
|  |  | ||||||
| type Client struct { | type DiscoverResult struct { | ||||||
| 	httpClient *http.Client | 	Feed     *parser.Feed | ||||||
| 	userAgent  string | 	FeedLink string | ||||||
|  | 	Sources  []FeedSource | ||||||
| } | } | ||||||
|  |  | ||||||
| func (c *Client) get(url string) (*http.Response, error) { | func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) { | ||||||
| 	req, err := http.NewRequest("GET", url, nil) | 	result := &DiscoverResult{} | ||||||
| 	if err != nil { |  | ||||||
| 		return nil, err |  | ||||||
| 	} |  | ||||||
| 	req.Header.Set("User-Agent", c.userAgent) |  | ||||||
| 	return c.httpClient.Do(req) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| func (c *Client) getConditional(url, lastModified, etag string) (*http.Response, error) { |  | ||||||
| 	req, err := http.NewRequest("GET", url, nil) |  | ||||||
| 	if err != nil { |  | ||||||
| 		return nil, err |  | ||||||
| 	} |  | ||||||
| 	req.Header.Set("User-Agent", c.userAgent) |  | ||||||
| 	if lastModified != "" { |  | ||||||
| 		req.Header.Set("If-Modified-Since", lastModified) |  | ||||||
| 	} |  | ||||||
| 	if etag != "" { |  | ||||||
| 		req.Header.Set("If-None-Match", etag) |  | ||||||
| 	} |  | ||||||
| 	return c.httpClient.Do(req) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| var defaultClient *Client |  | ||||||
|  |  | ||||||
| func searchFeedLinks(html []byte, siteurl string) ([]FeedSource, error) { |  | ||||||
| 	sources := make([]FeedSource, 0, 0) |  | ||||||
| 	for url, title := range scraper.FindFeeds(string(html), siteurl) { |  | ||||||
| 		sources = append(sources, FeedSource{Title: title, Url: url}) |  | ||||||
| 	} |  | ||||||
| 	return sources, nil |  | ||||||
| } |  | ||||||
|  |  | ||||||
| func DiscoverFeed(candidateUrl string) (*parser.Feed, string, *[]FeedSource, error) { |  | ||||||
| 	// Query URL | 	// Query URL | ||||||
| 	res, err := defaultClient.get(candidateUrl) | 	res, err := client.get(candidateUrl) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, "", nil, err | 		return nil, err | ||||||
| 	} | 	} | ||||||
| 	defer res.Body.Close() | 	defer res.Body.Close() | ||||||
| 	if res.StatusCode != 200 { | 	if res.StatusCode != 200 { | ||||||
| 		errmsg := fmt.Sprintf("Failed to fetch feed %s (status: %d)", candidateUrl, res.StatusCode) | 		return nil, fmt.Errorf("status code %d", res.StatusCode) | ||||||
| 		return nil, "", nil, errors.New(errmsg) |  | ||||||
| 	} | 	} | ||||||
| 	content, err := ioutil.ReadAll(res.Body) | 	content, err := ioutil.ReadAll(res.Body) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, "", nil, err | 		return nil, err | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Try to feed into parser | 	// Try to feed into parser | ||||||
| 	feed, err := parser.Parse(bytes.NewReader(content)) | 	feed, err := parser.Parse(bytes.NewReader(content)) | ||||||
| 	if err == nil { | 	if err == nil { | ||||||
| 		/* | 		feed.TranslateURLs(candidateUrl) | ||||||
| 		// WILD: feeds may not always have link to themselves | 		result.Feed = feed | ||||||
| 		if len(feed.FeedLink) == 0 { | 		result.FeedLink = candidateUrl | ||||||
| 			feed.FeedLink = candidateUrl | 		return result, nil | ||||||
| 		} |  | ||||||
| 		*/ |  | ||||||
|  |  | ||||||
| 		// WILD: resolve relative links (path, without host) |  | ||||||
| 		/* |  | ||||||
| 		base, _ := url.Parse(candidateUrl) |  | ||||||
| 		if link, err := url.Parse(feed.Link); err == nil && link.Host == "" { |  | ||||||
| 			feed.Link = base.ResolveReference(link).String() |  | ||||||
| 		} |  | ||||||
| 		if link, err := url.Parse(feed.FeedLink); err == nil && link.Host == "" { |  | ||||||
| 			feed.FeedLink = base.ResolveReference(link).String() |  | ||||||
| 		} |  | ||||||
| 		*/ |  | ||||||
| 		err := feed.TranslateURLs(candidateUrl) |  | ||||||
| 		if err != nil { |  | ||||||
| 			log.Printf("Failed to translate feed urls: %s", err) |  | ||||||
| 		} |  | ||||||
|  |  | ||||||
| 		return feed, candidateUrl, nil, nil |  | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Possibly an html link. Search for feed links | 	// Possibly an html link. Search for feed links | ||||||
| 	sources, err := searchFeedLinks(content, candidateUrl) | 	sources := make([]FeedSource, 0) | ||||||
| 	if err != nil { | 	for url, title := range scraper.FindFeeds(string(content), candidateUrl) { | ||||||
| 		return nil, "", nil, err | 		sources = append(sources, FeedSource{Title: title, Url: url}) | ||||||
| 	} else if len(sources) == 0 { | 	} | ||||||
| 		return nil, "", nil, errors.New("No feeds found at the given url") | 	switch { | ||||||
| 	} else if len(sources) == 1 { | 	case len(sources) == 0: | ||||||
|  | 		return nil, errors.New("No feeds found at the given url") | ||||||
|  | 	case len(sources) == 1: | ||||||
| 		if sources[0].Url == candidateUrl { | 		if sources[0].Url == candidateUrl { | ||||||
| 			return nil, "", nil, errors.New("Recursion!") | 			return nil, errors.New("Recursion!") | ||||||
| 		} | 		} | ||||||
| 		return DiscoverFeed(sources[0].Url) | 		return DiscoverFeed(sources[0].Url) | ||||||
| 	} | 	} | ||||||
| 	return nil, "", &sources, nil |  | ||||||
|  | 	result.Sources = sources | ||||||
|  | 	return result, nil | ||||||
| } | } | ||||||
|  |  | ||||||
| func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { | func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { | ||||||
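With the refactor, discovery has a single entry point and a single result type. A hedged sketch of a consumer (addFeed is hypothetical; the branching mirrors handleFeedList above):

	// addFeed branches on the one DiscoverResult value instead of the
	// old (feed, feedUrl, sources, err) four-tuple.
	func addFeed(url string) error {
		result, err := DiscoverFeed(url)
		if err != nil {
			return fmt.Errorf("discovery failed for %s: %w", url, err)
		}
		if len(result.Sources) > 0 {
			// An HTML page linking several feeds: let the user choose.
			for _, src := range result.Sources {
				fmt.Println(src.Title, src.Url)
			}
			return nil
		}
		// Exactly one feed; result.FeedLink is the URL it was fetched from.
		fmt.Println("subscribed to:", result.Feed.Title, "via", result.FeedLink)
		return nil
	}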
| @@ -132,7 +81,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if len(websiteUrl) != 0 { | 	if len(websiteUrl) != 0 { | ||||||
| 		res, err := defaultClient.get(websiteUrl) | 		res, err := client.get(websiteUrl) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			return nil, err | 			return nil, err | ||||||
| 		} | 		} | ||||||
| @@ -157,7 +106,7 @@ func FindFavicon(websiteUrl, feedUrl string) (*[]byte, error) { | |||||||
| 		"image/gif", | 		"image/gif", | ||||||
| 	} | 	} | ||||||
| 	for _, url := range candidateUrls { | 	for _, url := range candidateUrls { | ||||||
| 		res, err := defaultClient.get(url) | 		res, err := client.get(url) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
| @@ -180,18 +129,6 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item { | |||||||
| 	result := make([]storage.Item, len(items)) | 	result := make([]storage.Item, len(items)) | ||||||
| 	for i, item := range items { | 	for i, item := range items { | ||||||
| 		item := item | 		item := item | ||||||
| 		podcastUrl := item.PodcastURL |  | ||||||
|  |  | ||||||
| 		/* |  | ||||||
| 		var podcastUrl *string |  | ||||||
| 		if item.Enclosures != nil { |  | ||||||
| 			for _, enclosure := range item.Enclosures { |  | ||||||
| 				if strings.ToLower(enclosure.Type) == "audio/mpeg" { |  | ||||||
| 					podcastUrl = &enclosure.URL |  | ||||||
| 				} |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 		*/ |  | ||||||
| 		result[i] = storage.Item{ | 		result[i] = storage.Item{ | ||||||
| 			GUID:        item.GUID, | 			GUID:        item.GUID, | ||||||
| 			FeedId:      feed.Id, | 			FeedId:      feed.Id, | ||||||
| @@ -203,33 +140,30 @@ func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item { | |||||||
| 			Date:        &item.Date, | 			Date:        &item.Date, | ||||||
| 			Status:      storage.UNREAD, | 			Status:      storage.UNREAD, | ||||||
| 			Image:       item.ImageURL, | 			Image:       item.ImageURL, | ||||||
| 			PodcastURL:  &podcastUrl, | 			PodcastURL:  nil, | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	return result | 	return result | ||||||
| } | } | ||||||
|  |  | ||||||
| func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { | func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { | ||||||
| 	var res *http.Response | 	lmod := "" | ||||||
| 	var err error | 	etag := "" | ||||||
|  | 	if state := db.GetHTTPState(f.Id); state != nil { | ||||||
| 	httpState := db.GetHTTPState(f.Id) | 		lmod = state.LastModified | ||||||
| 	if httpState != nil { | 		etag = state.Etag | ||||||
| 		res, err = defaultClient.getConditional(f.FeedLink, httpState.LastModified, httpState.Etag) |  | ||||||
| 	} else { |  | ||||||
| 		res, err = defaultClient.get(f.FeedLink) |  | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | 	res, err := client.getConditional(f.FeedLink, lmod, etag) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, fmt.Errorf("unable to get: %s", err) | 		return nil, fmt.Errorf("unable to get: %s", err) | ||||||
| 	} | 	} | ||||||
| 	defer res.Body.Close() | 	defer res.Body.Close() | ||||||
|  |  | ||||||
| 	if res.StatusCode/100 == 4 || res.StatusCode/100 == 5 { | 	switch { | ||||||
|  | 	case res.StatusCode < 200 || res.StatusCode > 399: | ||||||
| 		return nil, fmt.Errorf("status code %d", res.StatusCode) | 		return nil, fmt.Errorf("status code %d", res.StatusCode) | ||||||
| 	} | 	case res.StatusCode == http.StatusNotModified: | ||||||
|  |  | ||||||
| 	if res.StatusCode == 304 { |  | ||||||
| 		return nil, nil | 		return nil, nil | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| @@ -237,34 +171,17 @@ func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) { | |||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, fmt.Errorf("failed to init response body: %s", err) | 		return nil, fmt.Errorf("failed to init response body: %s", err) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	feed, err := parser.Parse(body) | 	feed, err := parser.Parse(body) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, fmt.Errorf("failed to parse: %s", err) | 		return nil, fmt.Errorf("failed to parse: %s", err) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	lastModified := res.Header.Get("Last-Modified") | 	lmod = res.Header.Get("Last-Modified") | ||||||
| 	etag := res.Header.Get("Etag") | 	etag = res.Header.Get("Etag") | ||||||
| 	if lastModified != "" || etag != "" { | 	if lmod != "" || etag != "" { | ||||||
| 		db.SetHTTPState(f.Id, lastModified, etag) | 		db.SetHTTPState(f.Id, lmod, etag) | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	return ConvertItems(feed.Items, f), nil | 	return ConvertItems(feed.Items, f), nil | ||||||
| } | } | ||||||
|  |  | ||||||
| func init() { |  | ||||||
| 	transport := &http.Transport{ |  | ||||||
| 		Proxy: http.ProxyFromEnvironment, |  | ||||||
| 		DialContext: (&net.Dialer{ |  | ||||||
| 			Timeout: 10 * time.Second, |  | ||||||
| 		}).DialContext, |  | ||||||
| 		DisableKeepAlives:   true, |  | ||||||
| 		TLSHandshakeTimeout: time.Second * 10, |  | ||||||
| 	} |  | ||||||
| 	httpClient := &http.Client{ |  | ||||||
| 		Timeout:   time.Second * 30, |  | ||||||
| 		Transport: transport, |  | ||||||
| 	} |  | ||||||
| 	defaultClient = &Client{ |  | ||||||
| 		httpClient: httpClient, |  | ||||||
| 		userAgent:  "Yarr/1.0", |  | ||||||
| 	} |  | ||||||
| } |  | ||||||
|   | |||||||
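Taken together, listItems and the stored HTTP state form a polling loop: the validators saved from the last response go out with the next request, and a 304 answer surfaces to the caller as a nil item slice. A sketch of that loop (poll and the log import are assumptions; GetHTTPState/SetHTTPState and CreateItems are the storage calls used above):

	// poll fetches one feed, honouring the cached Last-Modified/ETag pair.
	func poll(db *storage.Storage, feed storage.Feed) {
		items, err := listItems(feed, db) // sends If-Modified-Since / If-None-Match
		if err != nil {
			log.Printf("failed to refresh %s: %s", feed.FeedLink, err)
			return
		}
		if items == nil {
			return // 304 Not Modified: nothing new since the last poll
		}
		db.CreateItems(items)
	}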