Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ var envMap = map[string]func(*Collector, string){
},
}

// urlParser is the package-level WHATWG URL parser, constructed with the
// WithPercentEncodeSinglePercentSign option.
// NOTE(review): the option presumably makes the parser encode a lone "%"
// as "%25" rather than reject the URL — confirm against the whatwg-url docs.
var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())

// NewCollector creates a new Collector instance with default configuration
func NewCollector(options ...CollectorOption) *Collector {
c := &Collector{}
Expand Down Expand Up @@ -550,7 +552,7 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
parsedWhatwgURL, err := whatwgUrl.Parse(u)
parsedWhatwgURL, err := urlParser.Parse(u)
if err != nil {
return err
}
Expand Down Expand Up @@ -1082,7 +1084,7 @@ func (c *Collector) handleOnHTML(resp *Response) error {
return err
}
if href, found := doc.Find("base[href]").Attr("href"); found {
u, err := whatwgUrl.ParseRef(resp.Request.URL.String(), href)
u, err := urlParser.ParseRef(resp.Request.URL.String(), href)
if err == nil {
baseURL, err := url.Parse(u.Href(false))
if err == nil {
Expand Down
37 changes: 37 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ y">link</a>
`))
})

mux.HandleFunc("/100%25", func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte("100 percent"))
})

mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/octet-stream")
ww := bufio.NewWriter(w)
Expand Down Expand Up @@ -914,6 +918,39 @@ func TestTabsAndNewlines(t *testing.T) {
}
}

// TestLonePercent checks how the collector handles URLs containing a
// percent sign that does not introduce a valid escape sequence.
func TestLonePercent(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	// lastURI records the request URI of the most recent response.
	var lastURI string

	collector := NewCollector()
	collector.OnResponse(func(resp *Response) {
		lastURI = resp.Request.URL.RequestURI()
	})

	cases := []struct {
		target string // path (and query) appended to the test server URL
		want   string // request URI expected to reach the server
	}{
		// Automatic encoding is not really correct: browsers
		// would send a bare percent sign here. However, Go net/http
		// cannot send such requests due to
		// https://github.com/golang/go/issues/29808. So there are
		// really two alternatives: return an error when attempting
		// to fetch such URLs, or at least try the encoded variant.
		// This case checks that the latter is attempted.
		{target: "/100%", want: "/100%25"},
		// An invalid URL escape in the query component is not a
		// problem, but check it anyway.
		{target: "/?a=100%zz", want: "/?a=100%zz"},
	}

	for _, tc := range cases {
		if err := collector.Visit(srv.URL + tc.target); err != nil {
			t.Errorf("visit failed: %v", err)
		}
		if lastURI != tc.want {
			t.Errorf("got=%q want=%q", lastURI, tc.want)
		}
	}
}

func TestCollectorCookies(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down
4 changes: 3 additions & 1 deletion queue/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (

const stop = true

// urlParser is the WHATWG URL parser used by this package, constructed with
// the WithPercentEncodeSinglePercentSign option.
// NOTE(review): the option presumably makes the parser encode a lone "%"
// as "%25" rather than reject the URL — confirm against the whatwg-url docs.
var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())

// Storage is the interface of the queue's storage backend
// Storage must be concurrently safe for multiple goroutines.
type Storage interface {
Expand Down Expand Up @@ -77,7 +79,7 @@ func (q *Queue) IsEmpty() bool {

// AddURL adds a new URL to the queue
func (q *Queue) AddURL(URL string) error {
u, err := whatwgUrl.Parse(URL)
u, err := urlParser.Parse(URL)
if err != nil {
return err
}
Expand Down
6 changes: 2 additions & 4 deletions request.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ import (
"net/url"
"strings"
"sync/atomic"

whatwgUrl "github.com/nlnwa/whatwg-url/url"
)

// Request is the representation of a HTTP request made by a Collector
Expand Down Expand Up @@ -66,7 +64,7 @@ type serializableRequest struct {

// New creates a new request with the context of the original request
func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
u, err := whatwgUrl.Parse(URL)
u, err := urlParser.Parse(URL)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -104,7 +102,7 @@ func (r *Request) AbsoluteURL(u string) string {
base = r.URL
}

absURL, err := whatwgUrl.ParseRef(base.String(), u)
absURL, err := urlParser.ParseRef(base.String(), u)
if err != nil {
return ""
}
Expand Down