@@ -2,6 +2,7 @@ package goscraper
22
33import (
44 "bytes"
5+ "errors"
56 "fmt"
67 "io"
78 "net/http"
@@ -18,10 +19,16 @@ var (
1819 fragmentRegexp = regexp .MustCompile ("#!(.*)" )
1920)
2021
22+ type ScraperOptions struct {
23+ MaxDocumentLength int64
24+ UserAgent string
25+ }
26+
2127type Scraper struct {
2228 Url * url.URL
2329 EscapedFragmentUrl * url.URL
2430 MaxRedirect int
31+ Options ScraperOptions
2532}
2633
2734type Document struct {
@@ -38,12 +45,12 @@ type DocumentPreview struct {
3845 Link string
3946}
4047
41- func Scrape (uri string , maxRedirect int ) (* Document , error ) {
48+ func Scrape (uri string , maxRedirect int , options ScraperOptions ) (* Document , error ) {
4249 u , err := url .Parse (uri )
4350 if err != nil {
4451 return nil , err
4552 }
46- return (& Scraper {Url : u , MaxRedirect : maxRedirect }).Scrape ()
53+ return (& Scraper {Url : u , MaxRedirect : maxRedirect , Options : options }).Scrape ()
4754}
4855
4956func (scraper * Scraper ) Scrape () (* Document , error ) {
@@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error {
109116}
110117
111118func (scraper * Scraper ) getDocument () (* Document , error ) {
119+ addUserAgent := func (req * http.Request ) * http.Request {
120+ userAgent := "GoScraper"
121+ if len (scraper .Options .UserAgent ) != 0 {
122+ userAgent = scraper .Options .UserAgent
123+ }
124+ req .Header .Add ("User-Agent" , userAgent )
125+
126+ return req
127+ }
128+
112129 scraper .MaxRedirect -= 1
113130 if strings .Contains (scraper .Url .String (), "#!" ) {
114131 scraper .toFragmentUrl ()
@@ -117,11 +134,30 @@ func (scraper *Scraper) getDocument() (*Document, error) {
117134 scraper .EscapedFragmentUrl = scraper .Url
118135 }
119136
137+	if scraper .Options .MaxDocumentLength > 0 {
138+ req , err := http .NewRequest ("HEAD" , scraper .getUrl (), nil )
139+ if err != nil {
140+ return nil , err
141+ }
142+ req = addUserAgent (req )
143+
144+ resp , err := http .DefaultClient .Do (req )
145+ if resp != nil {
146+ defer resp .Body .Close ()
147+ }
148+ if err != nil {
149+ return nil , err
150+ }
151+ if resp .ContentLength > scraper .Options .MaxDocumentLength {
152+			return nil , errors .New ("Content-Length exceeds limit" )
153+ }
154+ }
155+
120156 req , err := http .NewRequest ("GET" , scraper .getUrl (), nil )
121157 if err != nil {
122158 return nil , err
123159 }
124- req . Header . Add ( "User-Agent" , "GoScraper" )
160+ req = addUserAgent ( req )
125161
126162 resp , err := http .DefaultClient .Do (req )
127163 if resp != nil {
@@ -135,6 +171,11 @@ func (scraper *Scraper) getDocument() (*Document, error) {
135171 scraper .EscapedFragmentUrl = nil
136172 scraper .Url = resp .Request .URL
137173 }
174+
175+ if scraper .Options .MaxDocumentLength > 0 {
176+ resp .Body = http .MaxBytesReader (nil , resp .Body , scraper .Options .MaxDocumentLength )
177+ }
178+
138179 b , err := convertUTF8 (resp .Body , resp .Header .Get ("content-type" ))
139180 if err != nil {
140181 return nil , err
@@ -197,7 +238,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
197238 if cleanStr (attr .Key ) == "rel" && cleanStr (attr .Val ) == "canonical" {
198239 canonical = true
199240 }
200- if cleanStr (attr .Key ) == "rel" && strings .Contains (cleanStr (attr .Val ), "icon" ) {
241+ if cleanStr (attr .Key ) == "rel" && strings .Contains (cleanStr (attr .Val ), "icon" ) {
201242 hasIcon = true
202243 }
203244 if cleanStr (attr .Key ) == "href" {
0 commit comments