Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 9 additions & 16 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
push:
branches:
- '**'
pull_request:

jobs:
test:
Expand All @@ -14,13 +15,9 @@ jobs:
max-parallel: 4
matrix:
go: [
"1.19",
"1.18",
"1.17",
"1.16",
"1.15",
"1.14",
"1.13",
"1.12",
"1.11"
]

steps:
Expand All @@ -34,10 +31,10 @@ jobs:

- name: Test
run: |
go get -u golang.org/x/lint/golint
go install golang.org/x/lint/golint@latest
OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(golint ./...)"; test -z "$OUT" || (echo "$OUT" && return 1)
golint -set_exit_status
go vet -v ./...
go test -race -v -coverprofile=coverage.txt -covermode=atomic ./

Expand All @@ -49,13 +46,9 @@ jobs:
max-parallel: 4
matrix:
go: [
"1.19",
"1.18",
"1.17",
"1.16",
"1.15",
"1.14",
"1.13",
"1.12",
"1.11"
]

steps:
Expand All @@ -69,10 +62,10 @@ jobs:

- name: Build
run: |
go get -u golang.org/x/lint/golint
go install golang.org/x/lint/golint@latest
OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(golint ./...)"; test -z "$OUT" || (echo "$OUT" && return 1)
golint -set_exit_status
go build

codecov:
Expand Down
15 changes: 8 additions & 7 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -469,13 +469,14 @@ func (c *Collector) Init() {
// With an http.Client that is provided by appengine/urlfetch
// This function should be used when the scraper is run on
// Google App Engine. Example:
// func startScraper(w http.ResponseWriter, r *http.Request) {
// ctx := appengine.NewContext(r)
// c := colly.NewCollector()
// c.Appengine(ctx)
// ...
// c.Visit("https://google.ca")
// }
//
// func startScraper(w http.ResponseWriter, r *http.Request) {
// ctx := appengine.NewContext(r)
// c := colly.NewCollector()
// c.Appengine(ctx)
// ...
// c.Visit("https://google.ca")
// }
func (c *Collector) Appengine(ctx context.Context) {
client := urlfetch.Client(ctx)
client.Jar = c.backend.Client.Jar
Expand Down
6 changes: 6 additions & 0 deletions extensions/random_user_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ var osStrings = []string{
}

// Generates Firefox Browser User-Agent (Desktop)
//
// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0"
func genFirefoxUA() string {
version := ffVersions[rand.Intn(len(ffVersions))]
Expand All @@ -253,6 +254,7 @@ func genFirefoxUA() string {
}

// Generates Chrome Browser User-Agent (Desktop)
//
// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
func genChromeUA() string {
version := chromeVersions[rand.Intn(len(chromeVersions))]
Expand All @@ -261,6 +263,7 @@ func genChromeUA() string {
}

// Generates Microsoft Edge User-Agent (Desktop)
//
// -> "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39"
func genEdgeUA() string {
version := edgeVersions[rand.Intn(len(edgeVersions))]
Expand All @@ -271,6 +274,7 @@ func genEdgeUA() string {
}

// Generates Opera Browser User-Agent (Desktop)
//
// -> "Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.8.131 Version/11.11"
func genOperaUA() string {
version := operaVersions[rand.Intn(len(operaVersions))]
Expand All @@ -279,6 +283,7 @@ func genOperaUA() string {
}

// Generates UCWEB/Nokia203 Browser User-Agent (Mobile)
//
// -> "UCWEB/2.0 (Java; U; MIDP-2.0; Nokia203/20.37) U2/1.0.0 UCMini/10.9.8.1006 (SpeedMode; Proxy; Android 4.4.4; SM-J110H ) U2/1.0.0 Mobile"
func genMobileUcwebUA() string {
device := ucwebDevices[rand.Intn(len(ucwebDevices))]
Expand All @@ -288,6 +293,7 @@ func genMobileUcwebUA() string {
}

// Generates Nexus 10 Browser User-Agent (Mobile)
//
// -> "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 10 Build/LMY48T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.91 Safari/537.36"
func genMobileNexus10UA() string {
build := nexus10Builds[rand.Intn(len(nexus10Builds))]
Expand Down
4 changes: 2 additions & 2 deletions http_backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header
// Both DomainRegexp and DomainGlob can be used to specify
// the included domains patterns, but at least one is required.
// There can be two kinds of limitations:
// - Parallelism: Set limit for the number of concurrent requests to matching domains
// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
// - Parallelism: Set limit for the number of concurrent requests to matching domains
// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
type LimitRule struct {
// DomainRegexp is a regular expression to match against domains
DomainRegexp string
Expand Down
14 changes: 7 additions & 7 deletions unmarshal.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,17 @@ func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]strin
// UnmarshalHTML declaratively extracts text or attributes to a struct from
// HTML response using struct tags composed of css selectors.
// Allowed struct tags:
// - "selector" (required): CSS (goquery) selector of the desired data
// - "attr" (optional): Selects the matching element's attribute's value.
// - "selector" (required): CSS (goquery) selector of the desired data
// - "attr" (optional): Selects the matching element's attribute's value.
// Leave it blank or omit it to get the text of the element.
//
// Example struct declaration:
//
// type Nested struct {
// String string `selector:"div > p"`
// Classes []string `selector:"li" attr:"class"`
// Struct *Nested `selector:"div > div"`
// }
// type Nested struct {
// String string `selector:"div > p"`
// Classes []string `selector:"li" attr:"class"`
// Struct *Nested `selector:"div > div"`
// }
//
// Supported types: struct, *struct, string, []string
func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error {
Expand Down