From 83c6f807cb451daa29603e0412f54ff715f71c39 Mon Sep 17 00:00:00 2001 From: WGH Date: Fri, 28 Jan 2022 15:27:23 +0300 Subject: [PATCH 1/3] Use http.NewRequest instead of &http.Request{...} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit http.NewRequest already does exactly what setRequestBody did. In fact, scrape and setRequestBody used to contain parts of http.NewRequest implementation. 119df7afac implies that NewRequest is slower than assembling http.Request object manually, likely because it avoids URL reparsing. Indeed, this change brings some slowdown: name old time/op new time/op delta OnHTML-24 129µs ± 0% 131µs ± 1% +1.69% (p=0.029 n=4+4) OnXML-24 130µs ± 0% 132µs ± 1% +1.69% (p=0.029 n=4+4) OnResponse-24 93.4µs ± 0% 95.5µs ± 1% +2.30% (p=0.029 n=4+4) However, the benchmarks are somewhat unrealistic as the HTML page is extremely simple, the web server runs on localhost, and the data is discarded (rather than e.g. stored somewhere). In practice, impact of this is going to be negligible, and code complexity and code duplication is not really worth it. --- colly.go | 54 +++++------------------------------------------------- 1 file changed, 5 insertions(+), 49 deletions(-) diff --git a/colly.go b/colly.go index decfd31f..9e6e814a 100644 --- a/colly.go +++ b/colly.go @@ -24,7 +24,6 @@ import ( "fmt" "hash/fnv" "io" - "io/ioutil" "log" "net/http" "net/http/cookiejar" @@ -572,30 +571,19 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c if _, ok := hdr["User-Agent"]; !ok { hdr.Set("User-Agent", c.UserAgent) } - rc, ok := requestData.(io.ReadCloser) - if !ok && requestData != nil { - rc = ioutil.NopCloser(requestData) + req, err := http.NewRequest(method, parsedURL.String(), requestData) + if err != nil { + return err } + req.Header = hdr // The Go HTTP API ignores "Host" in the headers, preferring the client // to use the Host field on Request. 
- host := parsedURL.Host if hostHeader := hdr.Get("Host"); hostHeader != "" { - host = hostHeader - } - req := &http.Request{ - Method: method, - URL: parsedURL, - Proto: "HTTP/1.1", - ProtoMajor: 1, - ProtoMinor: 1, - Header: hdr, - Body: rc, - Host: host, + req.Host = hostHeader } // note: once 1.13 is minimum supported Go version, // replace this with http.NewRequestWithContext req = req.WithContext(c.Context) - setRequestBody(req, requestData) u = parsedURL.String() c.wg.Add(1) if c.Async { @@ -605,38 +593,6 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c return c.fetch(u, method, depth, requestData, ctx, hdr, req) } -func setRequestBody(req *http.Request, body io.Reader) { - if body != nil { - switch v := body.(type) { - case *bytes.Buffer: - req.ContentLength = int64(v.Len()) - buf := v.Bytes() - req.GetBody = func() (io.ReadCloser, error) { - r := bytes.NewReader(buf) - return ioutil.NopCloser(r), nil - } - case *bytes.Reader: - req.ContentLength = int64(v.Len()) - snapshot := *v - req.GetBody = func() (io.ReadCloser, error) { - r := snapshot - return ioutil.NopCloser(&r), nil - } - case *strings.Reader: - req.ContentLength = int64(v.Len()) - snapshot := *v - req.GetBody = func() (io.ReadCloser, error) { - r := snapshot - return ioutil.NopCloser(&r), nil - } - } - if req.GetBody != nil && req.ContentLength == 0 { - req.Body = http.NoBody - req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil } - } - } -} - func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error { defer c.wg.Done() if ctx == nil { From e3a23079520e6ec6356f62fed77acf73646a75d4 Mon Sep 17 00:00:00 2001 From: WGH Date: Fri, 28 Jan 2022 16:12:26 +0300 Subject: [PATCH 2/3] Use request.GetBody for request hashing streamToByte has an ugly type switch that tries to rewind the body so it could be read again. 
However, http.NewRequest already does something similar: when Body type allows, GetBody is populated so request can be retried. So just use it instead of trying to reimplement the body rewinding, and remove streamToByte function altogether. --- colly.go | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/colly.go b/colly.go index 9e6e814a..76f4ef0c 100644 --- a/colly.go +++ b/colly.go @@ -561,10 +561,6 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c if err != nil { return err } - if err := c.requestCheck(u, parsedURL, method, requestData, depth, checkRevisit); err != nil { - return err - } - if hdr == nil { hdr = http.Header{} } @@ -584,6 +580,9 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c // note: once 1.13 is minimum supported Go version, // replace this with http.NewRequestWithContext req = req.WithContext(c.Context) + if err := c.requestCheck(u, parsedURL, method, req.GetBody, depth, checkRevisit); err != nil { + return err + } u = parsedURL.String() c.wg.Add(1) if c.Async { @@ -671,7 +670,7 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct return err } -func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, requestData io.Reader, depth int, checkRevisit bool) error { +func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, getBody func() (io.ReadCloser, error), depth int, checkRevisit bool) error { if u == "" { return ErrMissingURL } @@ -693,8 +692,13 @@ func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, re var uHash uint64 if method == "GET" { uHash = h.Sum64() - } else if requestData != nil { - h.Write(streamToByte(requestData)) + } else if getBody != nil { + r, err := getBody() + if err != nil { + return err + } + defer r.Close() + io.Copy(h, r) uHash = h.Sum64() } else { return nil @@ -1303,7 +1307,7 @@ func (c *Collector) 
checkHasVisited(URL string, requestData map[string]string) ( h.Write([]byte(URL)) if requestData != nil { - h.Write(streamToByte(createFormReader(requestData))) + io.Copy(h, createFormReader(requestData)) } return c.store.IsVisited(h.Sum64()) @@ -1417,16 +1421,3 @@ func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool { } return false } - -func streamToByte(r io.Reader) []byte { - buf := new(bytes.Buffer) - buf.ReadFrom(r) - - if strReader, k := r.(*strings.Reader); k { - strReader.Seek(0, 0) - } else if bReader, kb := r.(*bytes.Reader); kb { - bReader.Seek(0, 0) - } - - return buf.Bytes() -} From 4e8e0d7b695b99e2344c6fa2fef0a969664fe030 Mon Sep 17 00:00:00 2001 From: WGH Date: Fri, 28 Jan 2022 16:39:32 +0300 Subject: [PATCH 3/3] Refactor body hashing The algorithm - specific hash function and what data is hashed - was duplicated in several locations. Move it to separate function. --- colly.go | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/colly.go b/colly.go index 76f4ef0c..420f37da 100644 --- a/colly.go +++ b/colly.go @@ -686,24 +686,23 @@ func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, ge } } if checkRevisit && !c.AllowURLRevisit { - h := fnv.New64a() - h.Write([]byte(u)) - - var uHash uint64 - if method == "GET" { - uHash = h.Sum64() - } else if getBody != nil { - r, err := getBody() + // TODO weird behaviour, it allows CheckHead to work correctly, + // but it should probably better be solved with + // "check-but-not-save" flag or something + if method != "GET" && getBody == nil { + return nil + } + + var body io.ReadCloser + if getBody != nil { + var err error + body, err = getBody() if err != nil { return err } - defer r.Close() - io.Copy(h, r) - uHash = h.Sum64() - } else { - return nil + defer body.Close() } - + uHash := requestHash(u, body) visited, err := c.store.IsVisited(uHash) if err != nil { return err @@ -1303,14 +1302,8 @@ func (c *Collector) 
parseSettingsFromEnv() { } func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) { - h := fnv.New64a() - h.Write([]byte(URL)) - - if requestData != nil { - io.Copy(h, createFormReader(requestData)) - } - - return c.store.IsVisited(h.Sum64()) + hash := requestHash(URL, createFormReader(requestData)) + return c.store.IsVisited(hash) } // SanitizeFileName replaces dangerous characters in a string @@ -1421,3 +1414,12 @@ func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool { } return false } + +func requestHash(url string, body io.Reader) uint64 { + h := fnv.New64a() + h.Write([]byte(url)) + if body != nil { + io.Copy(h, body) + } + return h.Sum64() +}