Commit a5112e7

Add MaxDocumentLength and custom UserAgent support
1 parent 36995ce commit a5112e7


goscraper.go

Lines changed: 45 additions & 4 deletions
@@ -2,6 +2,7 @@ package goscraper
 
 import (
     "bytes"
+    "errors"
     "fmt"
     "io"
     "net/http"
@@ -18,10 +19,16 @@ var (
     fragmentRegexp = regexp.MustCompile("#!(.*)")
 )
 
+type ScraperOptions struct {
+    MaxDocumentLength int64
+    UserAgent         string
+}
+
 type Scraper struct {
     Url                *url.URL
     EscapedFragmentUrl *url.URL
     MaxRedirect        int
+    Options            ScraperOptions
 }
 
 type Document struct {
@@ -38,12 +45,12 @@ type DocumentPreview struct {
     Link string
 }
 
-func Scrape(uri string, maxRedirect int) (*Document, error) {
+func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) {
     u, err := url.Parse(uri)
     if err != nil {
         return nil, err
     }
-    return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape()
+    return (&Scraper{Url: u, MaxRedirect: maxRedirect, Options: options}).Scrape()
 }
 
 func (scraper *Scraper) Scrape() (*Document, error) {
@@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error {
 }
 
 func (scraper *Scraper) getDocument() (*Document, error) {
+    addUserAgent := func(req *http.Request) *http.Request {
+        userAgent := "GoScraper"
+        if len(scraper.Options.UserAgent) != 0 {
+            userAgent = scraper.Options.UserAgent
+        }
+        req.Header.Add("User-Agent", userAgent)
+
+        return req
+    }
+
     scraper.MaxRedirect -= 1
     if strings.Contains(scraper.Url.String(), "#!") {
         scraper.toFragmentUrl()
@@ -117,11 +134,30 @@ func (scraper *Scraper) getDocument() (*Document, error) {
         scraper.EscapedFragmentUrl = scraper.Url
     }
 
+    if scraper.Options.MaxDocumentLength != 0 {
+        req, err := http.NewRequest("HEAD", scraper.getUrl(), nil)
+        if err != nil {
+            return nil, err
+        }
+        req = addUserAgent(req)
+
+        resp, err := http.DefaultClient.Do(req)
+        if resp != nil {
+            defer resp.Body.Close()
+        }
+        if err != nil {
+            return nil, err
+        }
+        if resp.ContentLength > scraper.Options.MaxDocumentLength {
+            return nil, errors.New("Content-Length exceeds limit")
+        }
+    }
+
     req, err := http.NewRequest("GET", scraper.getUrl(), nil)
     if err != nil {
         return nil, err
     }
-    req.Header.Add("User-Agent", "GoScraper")
+    req = addUserAgent(req)
 
     resp, err := http.DefaultClient.Do(req)
     if resp != nil {
@@ -135,6 +171,11 @@ func (scraper *Scraper) getDocument() (*Document, error) {
         scraper.EscapedFragmentUrl = nil
         scraper.Url = resp.Request.URL
     }
+
+    if scraper.Options.MaxDocumentLength > 0 {
+        resp.Body = http.MaxBytesReader(nil, resp.Body, scraper.Options.MaxDocumentLength)
+    }
+
     b, err := convertUTF8(resp.Body, resp.Header.Get("content-type"))
     if err != nil {
         return nil, err
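
Note on the cap added above: http.MaxBytesReader enforces MaxDocumentLength even when the server omits or understates Content-Length, which the HEAD pre-check alone cannot catch. A minimal standalone sketch of its behavior (not part of this commit; assumes a recent Go toolchain):

package main

import (
    "fmt"
    "io"
    "net/http"
    "strings"
)

func main() {
    // A 10-byte body capped at 4 bytes; MaxBytesReader accepts a
    // nil http.ResponseWriter, as the commit relies on.
    body := io.NopCloser(strings.NewReader("0123456789"))
    capped := http.MaxBytesReader(nil, body, 4)

    // io.ReadAll returns the bytes read before the limit was hit,
    // plus a non-nil error once the cap is exceeded.
    data, err := io.ReadAll(capped)
    fmt.Println(len(data), err) // 4 http: request body too large
}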
@@ -197,7 +238,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
         if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" {
             canonical = true
         }
-        if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
+        if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
             hasIcon = true
         }
         if cleanStr(attr.Key) == "href" {
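
For reference, a minimal usage sketch of the new API. The import path, the example URL, and the Preview/Link fields are assumptions for illustration, not shown in this diff:

package main

import (
    "fmt"
    "log"

    "github.com/badoux/goscraper" // assumed import path
)

func main() {
    // The zero value of ScraperOptions keeps the old behavior:
    // "GoScraper" User-Agent and no document-length check.
    opts := goscraper.ScraperOptions{
        MaxDocumentLength: 1 << 20,     // 1 MiB cap; also enables the HEAD pre-check
        UserAgent:         "MyBot/1.0", // replaces the default "GoScraper"
    }

    // The second argument is the maximum number of redirects to follow.
    doc, err := goscraper.Scrape("https://example.com", 5, opts)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println(doc.Preview.Link) // Preview/Link assumed from the package's DocumentPreview
}

Note that the new Scrape signature is a breaking change: existing callers of Scrape(uri, maxRedirect) must now pass a ScraperOptions value, even if empty.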
