Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ type Collector struct {
CheckHead bool
// TraceHTTP enables capturing and reporting request performance for crawler tuning.
// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
TraceHTTP bool
TraceHTTP bool
// Context is the context that will be used for HTTP requests. You can set this
// to support clean cancellation of scraping.
Context context.Context

store storage.Storage
debugger debug.Debugger
robotsMap map[string]*robotstxt.RobotsData
Expand Down Expand Up @@ -357,6 +361,14 @@ func TraceHTTP() CollectorOption {
}
}

// StdlibContext sets the context that will be used for HTTP requests.
// You can set this to support clean cancellation of scraping.
// A nil ctx is replaced with context.Background() — the same default
// installed by Init — because http.Request.WithContext panics when
// handed a nil context.
func StdlibContext(ctx context.Context) CollectorOption {
	return func(c *Collector) {
		if ctx == nil {
			// Defensive fallback: scrape() calls req.WithContext(c.Context),
			// which panics on a nil context.
			ctx = context.Background()
		}
		c.Context = ctx
	}
}

// ID sets the unique identifier of the Collector.
func ID(id uint32) CollectorOption {
return func(c *Collector) {
Expand Down Expand Up @@ -412,6 +424,7 @@ func (c *Collector) Init() {
c.IgnoreRobotsTxt = true
c.ID = atomic.AddUint32(&collectorCounter, 1)
c.TraceHTTP = false
c.Context = context.Background()
}

// Appengine will replace the Collector's backend http.Client
Expand Down Expand Up @@ -567,6 +580,9 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
Body: rc,
Host: host,
}
// note: once 1.13 is minimum supported Go version,
// replace this with http.NewRequestWithContext
req = req.WithContext(c.Context)
setRequestBody(req, requestData)
u = parsedURL.String()
c.wg.Add(1)
Expand Down Expand Up @@ -1239,6 +1255,7 @@ func (c *Collector) Clone() *Collector {
ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
UserAgent: c.UserAgent,
TraceHTTP: c.TraceHTTP,
Context: c.Context,
store: c.store,
backend: c.backend,
debugger: c.debugger,
Expand Down
64 changes: 64 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package colly
import (
"bufio"
"bytes"
"context"
"fmt"
"net/http"
"net/http/httptest"
Expand All @@ -26,6 +27,7 @@ import (
"regexp"
"strings"
"testing"
"time"

"github.com/PuerkitoBio/goquery"

Expand Down Expand Up @@ -166,6 +168,31 @@ func newTestServer() *httptest.Server {
}
})

// "/slow" streams one timestamp line every 100ms for 10 ticks (~1 second
// total), flushing after each write so bytes reach the client immediately.
// It returns early when the request context is cancelled, which lets tests
// exercise client-side cancellation of an in-flight transfer.
mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) {
	w.WriteHeader(200)

	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	// Number of lines written so far; the handler stops after 10.
	i := 0

	for {
		select {
		case <-r.Context().Done():
			// Client went away (or its context was cancelled); stop streaming.
			return
		case t := <-ticker.C:
			fmt.Fprintf(w, "%s\n", t)
			// Flush so each line hits the wire now rather than when the
			// handler returns; without this the client sees nothing until
			// the response completes.
			if flusher, ok := w.(http.Flusher); ok {
				flusher.Flush()
			}
			i++
			if i == 10 {
				return
			}
		}
	}
})

return httptest.NewServer(mux)
}

Expand Down Expand Up @@ -1128,6 +1155,43 @@ func TestCollectorDepth(t *testing.T) {
}
}

// TestCollectorContext verifies that a collector-wide context cancels an
// in-flight transfer. The "/slow" endpoint streams for about one second,
// while the context below times out after 500ms, so the request must be
// aborted: OnError fires with context.DeadlineExceeded and OnResponse
// never runs.
func TestCollectorContext(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()

	c := NewCollector(StdlibContext(ctx))

	sawError := false

	c.OnResponse(func(resp *Response) {
		t.Error("OnResponse was called, expected OnError")
	})
	c.OnError(func(resp *Response, err error) {
		sawError = true
		if err != context.DeadlineExceeded {
			t.Errorf("OnError got err=%#v, expected context.DeadlineExceeded", err)
		}
	})

	if err := c.Visit(ts.URL + "/slow"); err != context.DeadlineExceeded {
		t.Errorf("Visit return err=%#v, expected context.DeadlineExceeded", err)
	}

	if !sawError {
		t.Error("OnError was not called")
	}
}

func BenchmarkOnHTML(b *testing.B) {
ts := newTestServer()
defer ts.Close()
Expand Down