Skip to content

Commit

Permalink
Use remote scraper only if local scraper failed
Browse files Browse the repository at this point in the history
  • Loading branch information
Wikidepia committed Jul 23, 2024
1 parent 7b067c9 commit 377bf0f
Showing 1 changed file with 54 additions and 33 deletions.
87 changes: 54 additions & 33 deletions handlers/scraper/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"instafix/utils"
"io"
"log/slog"
"net"
"net/http"
"net/url"
"strconv"
Expand All @@ -27,9 +28,21 @@ import (
)

var (
// NOTE(review): this text is a rendered diff, so the three declarations
// directly below appear to be the pre-change lines; the same three names are
// repeated (re-aligned) right after them. Confirm against the real file.
timeout = 10 * time.Second
ErrNotFound = errors.New("post not found")
transport = gzhttp.Transport(http.DefaultTransport, gzhttp.TransportAlwaysDecompress(true))
// timeout is the overall limit passed as http.Client.Timeout by the scrapers
// in this file.
timeout = 10 * time.Second
// ErrNotFound is the sentinel error for a post that could not be scraped;
// ScrapeRemoteData returns it when the remote scraper yields no usable data.
ErrNotFound = errors.New("post not found")
// transport wraps http.DefaultTransport with gzhttp, with the
// always-decompress option enabled.
transport = gzhttp.Transport(http.DefaultTransport, gzhttp.TransportAlwaysDecompress(true))
// transportNoProxy is a standalone transport with proxying disabled; it is
// the transport ScrapeRemoteData uses to reach the remote scraper.
// NOTE(review): the dialer, idle and handshake values below look like the
// ones documented for http.DefaultTransport - confirm if they are tuned.
transportNoProxy = &http.Transport{
Proxy: nil, // Skip any proxy
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
ForceAttemptHTTP2: true,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
}
)

var RemoteScraperAddr string
Expand Down Expand Up @@ -136,36 +149,6 @@ func GetData(postID string) (*InstaData, error) {
func (i *InstaData) ScrapeData() error {
client := http.Client{Transport: transport, Timeout: timeout}

// Scrape from remote scraper if available
if len(RemoteScraperAddr) > 0 {
var err error
req, err := http.NewRequest("GET", RemoteScraperAddr+"/scrape/"+i.PostID, nil)
if err != nil {
return err
}
req.Header.Set("Accept-Encoding", "zstd.dict")
res, err := client.Do(req)
if res != nil && res.StatusCode == 200 {
defer res.Body.Close()
zstdReader, err := zstd.NewReader(nil, zstd.WithDecoderLowmem(true), zstd.WithDecoderDicts(zstdDict))
if err != nil {
return err
}
remoteData, err := io.ReadAll(res.Body)
if err == nil {
remoteDecomp, err := zstdReader.DecodeAll(remoteData, nil)
if err != nil {
return err
}
if err = binary.Unmarshal(remoteDecomp, i); err == nil {
slog.Info("Data parsed from remote scraper", "postID", i.PostID)
return nil
}
}
slog.Error("Failed to scrape data from remote scraper", "postID", i.PostID, "status", res.StatusCode, "err", err)
}
}

req, err := http.NewRequest("GET", "https://www.instagram.com/p/"+i.PostID+"/embed/captioned/", nil)
if err != nil {
return err
Expand Down Expand Up @@ -236,6 +219,11 @@ func (i *InstaData) ScrapeData() error {
if gqlValue != nil && !strings.Contains(utils.B2S(gqlValue), "require_login") {
gqlData = gjson.Parse(utils.B2S(gqlValue)).Get("data")
slog.Info("Data parsed from GraphQL API", "postID", i.PostID)
} else if len(RemoteScraperAddr) > 0 {
// Scrape from remote scraper if available and failed to scrape from GraphQL API
if err := i.ScrapeRemoteData(); err == nil {
return nil
}
}
}

Expand Down Expand Up @@ -297,6 +285,39 @@ func (i *InstaData) ScrapeData() error {
return nil
}

// ScrapeRemoteData fetches the post identified by i.PostID from the remote
// scraper at RemoteScraperAddr and decodes the response into i.
//
// The request is sent through transportNoProxy and advertises the
// "zstd.dict" encoding; a 200 response body is decompressed with the zstd
// dictionary in zstdDict and then unmarshalled with binary.Unmarshal.
//
// It returns nil on success, the underlying error if the request cannot be
// built or the zstd decoder fails, and ErrNotFound for every other failure
// (transport error, non-200 status, unreadable or undecodable payload).
func (i *InstaData) ScrapeRemoteData() error {
	client := http.Client{Transport: transportNoProxy, Timeout: timeout}

	req, err := http.NewRequest("GET", RemoteScraperAddr+"/scrape/"+i.PostID, nil)
	if err != nil {
		return err
	}
	req.Header.Set("Accept-Encoding", "zstd.dict")

	res, err := client.Do(req)
	if err != nil {
		// Previously this error was dropped without a trace.
		slog.Error("Failed to scrape data from remote scraper", "postID", i.PostID, "err", err)
		return ErrNotFound
	}
	// Close the body for every status code, not only 200, so the underlying
	// connection is released.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		slog.Error("Failed to scrape data from remote scraper", "postID", i.PostID, "status", res.StatusCode)
		return ErrNotFound
	}

	zstdReader, err := zstd.NewReader(nil, zstd.WithDecoderLowmem(true), zstd.WithDecoderDicts(zstdDict))
	if err != nil {
		return err
	}
	// The decoder is created per call, so release its resources when done.
	defer zstdReader.Close()

	remoteData, err := io.ReadAll(res.Body)
	if err != nil {
		slog.Error("Failed to scrape data from remote scraper", "postID", i.PostID, "status", res.StatusCode, "err", err)
		return ErrNotFound
	}

	remoteDecomp, err := zstdReader.DecodeAll(remoteData, nil)
	if err != nil {
		return err
	}

	// Keep a single err variable in scope so the log below reports the
	// unmarshal failure itself rather than a stale nil.
	if err = binary.Unmarshal(remoteDecomp, i); err != nil {
		slog.Error("Failed to scrape data from remote scraper", "postID", i.PostID, "status", res.StatusCode, "err", err)
		return ErrNotFound
	}

	slog.Info("Data parsed from remote scraper", "postID", i.PostID)
	return nil
}

// Taken from https://github.com/PuerkitoBio/goquery
// Modified to add new line every <br>
func gqTextNewLine(s *goquery.Selection) string {
Expand Down

0 comments on commit 377bf0f

Please sign in to comment.