From fc5fade95ed4f3398e1972f2dd3b4f324000ec47 Mon Sep 17 00:00:00 2001 From: Marlik Almighty Date: Wed, 27 Dec 2023 23:30:31 +0300 Subject: [PATCH] up --- .dockerignore | 11 ++-- .gitignore | 5 +- Dockerfile | 33 ++++------ README.md | 13 +++- analyze.service | 23 +++++++ cmd/main.go | 8 ++- go.mod | 1 + go.sum | 2 + internal/app/core.go | 133 ++++++++++++++++++++++++++------------ internal/app/rzn.go | 31 +++------ internal/app/ya62.go | 39 +++++------ internal/config/config.go | 22 +++++-- internal/store/bolt.go | 8 +-- 13 files changed, 201 insertions(+), 128 deletions(-) create mode 100644 analyze.service diff --git a/.dockerignore b/.dockerignore index 776ca06..a20b559 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,11 +1,8 @@ -.idea/ .git/ -bin/ -swagger.yaml -Procfile -Makefile -Dockerfile -docker-compose.yaml .dockerignore .gitignore +bin/ +Makefile Dockerfile +README.md + diff --git a/.gitignore b/.gitignore index 945dc30..70b1981 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -.idea/ bin/ -config.json -data.db \ No newline at end of file +data.db + diff --git a/Dockerfile b/Dockerfile index c8475cb..5b81fdf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,7 @@ -FROM golang:1.18-alpine AS builder +FROM golang:1.21-alpine3.18 AS builder -ENV CGO_ENABLED 0 ENV TZ=Europe/Moscow -RUN apk update && apk upgrade && apk add --no-cache chromium - -RUN echo @edge http://nl.alpinelinux.org/alpine/edge/community >> /etc/apk/repositories \ - && echo @edge http://nl.alpinelinux.org/alpine/edge/main >> /etc/apk/repositories \ - && apk add --no-cache \ - harfbuzz@edge \ - nss@edge \ - freetype@edge \ - ttf-freefont@edge \ - && rm -rf /var/cache/* \ - && mkdir /var/cache/apk - WORKDIR /go/src/analyze COPY . . @@ -25,15 +12,23 @@ FROM gruebel/upx:latest as upx COPY --from=builder /go/src/analyze/app /app RUN upx --best --lzma -o /analyze /app -FROM scratch +#FROM scratch +FROM golang:1.21-alpine3.18 COPY --from=upx /app /app -ENV BOT_TOKEN="" -ENV CHANNEL="" +RUN echo @edge http://nl.alpinelinux.org/alpine/edge/community >> /etc/apk/repositories \ + && echo @edge http://nl.alpinelinux.org/alpine/edge/main >> /etc/apk/repositories \ + && apk update && apk upgrade \ + && apk add --no-cache ca-certificates && update-ca-certificates \ + && apk add --no-cache chromium chromium-chromedriver \ + && rm -rf /var/cache/* \ + && mkdir /var/cache/apk + ENV RZN_URL="" ENV YA_URL="" -ENV REDIS_URL="redis://127.0.0.1:6379" +ENV BOT_TOKEN="" +ENV MAIN_CHANNEL="-100***" +ENV MODERATOR_CHANNEL="-955***" -EXPOSE 3000 CMD ["/app"] diff --git a/README.md b/README.md index 141543f..5a454c0 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,19 @@ -# Analyze +## Analyze -### A simple news parser for yourself. +### A couple of examples how to parse sites use chromium-chromedriver -### For the parser to work, you need a driver, installation: +Before we need install driver: ```sh sudo apt -y install chromium-chromedriver ``` +### Build and start docker container + +```sh +docker buildx build . -t analyze +docker run -v /dev/shm:/dev/shm -itd --rm analyze +``` + diff --git a/analyze.service b/analyze.service new file mode 100644 index 0000000..1c511c6 --- /dev/null +++ b/analyze.service @@ -0,0 +1,23 @@ +[Unit] +Description=Analyze +After=network.target + +[Service] +Environment="BOT_TOKEN=578***" +Environment="CHANNEL=-100***" +Environment="MODERATOR_CHANNEL=-955***" +Environment="RZN_URL=https://***" +Environment="YA_URL=https://***" +Type=simple +PIDFile=/var/run/analyze.pid +WorkingDirectory=/home/marlik/analyze +ExecStart=/home/marlik/analyze/bin/app +ExecStop=/bin/kill -INT $MAINPID +User=root +Group=root +StandardOutput=file:/var/log/analyze.log +StandardError=file:/var/log/analyze.log +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/cmd/main.go b/cmd/main.go index 6846eac..3e837cb 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -14,15 +14,17 @@ import ( func main() { + var err error + // got config - cnf, err := config.LoadConfig() - if err != nil { + cnf := config.New() + if err = cnf.GetEnv(); err != nil { log.Fatalf("error config: %v\n", err) } // connect to store var r *store.Wrapper - if r, err = r.New("posts", "ttl"); err != nil { + if r, err = r.New(); err != nil { log.Fatalf("error store: %v\n", err) } diff --git a/go.mod b/go.mod index 10e6d6e..727e0d9 100644 --- a/go.mod +++ b/go.mod @@ -18,6 +18,7 @@ require ( github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect + github.com/kelseyhightower/envconfig v1.4.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/onsi/ginkgo v1.16.5 // indirect github.com/onsi/gomega v1.27.8 // indirect diff --git a/go.sum b/go.sum index d204280..7b5fe23 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,8 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dvMUtDTo2cv8= +github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= diff --git a/internal/app/core.go b/internal/app/core.go index 2ad52ae..1b98895 100644 --- a/internal/app/core.go +++ b/internal/app/core.go @@ -33,6 +33,7 @@ type ( Run() Stop() //browser(opts []chromedp.ExecAllocatorOption, url string) (string, error) + checkLink(m map[string]string) (map[string]string, error) checkPreSend(v models.Post) error checkBlank(v *models.Post) bool sendToModerChannel(p *models.Post) error @@ -54,7 +55,7 @@ type ( func (core *Core) Run() { - statsInt := core.mustParseDuration("1h") + statsInt := core.mustParseDuration("30m") statsTimer := time.NewTimer(statsInt) mp := make(map[string]string) @@ -67,7 +68,13 @@ func (core *Core) Run() { // init driver driver := agouti.ChromeDriver( - agouti.ChromeOptions("args", []string{"--headless", "--disable-gpu", "--no-sandbox"}), + agouti.ChromeOptions("args", []string{ + "--headless", + "--disable-gpu", + "--no-sandbox", + "--whitelisted-ips", + }), + // agouti.Debug, ) // ttl all posts @@ -77,6 +84,11 @@ func (core *Core) Run() { statsTimer.Reset(statsInt) + if err = driver.Start(); err != nil { + log.Printf("error driver start: %v\n", err) + return + } + func() { log.Println("start clear database") if err = core.Store.Sweep(maxAge); err != nil { @@ -85,20 +97,21 @@ func (core *Core) Run() { } }() - if err = driver.Start(); err != nil { - log.Printf("error driver start: %v\n", err) - return - } - - if page, err = driver.NewPage(); err != nil { - log.Printf("error new page: %v\n", err) - return - } - func() { log.Println("start parsing rzn.info") + page, err = driver.NewPage() + if err != nil { + log.Println("[RZN]: error new page") + } + + defer func() { + if err = page.Destroy(); err != nil { + log.Println("[RZN]: error page destroy") + } + }() + if err = page.Navigate(core.Config.RznUrl); err != nil { log.Println("[RZN]: error got main page: " + err.Error()) return @@ -115,6 +128,11 @@ func (core *Core) Run() { return } + if mp, err = core.checkLink(mp); err != nil { + log.Println("[RZN]: error check link from rzn: " + err.Error()) + return + } + // range for links for url := range mp { @@ -148,6 +166,17 @@ func (core *Core) Run() { log.Println("start parsing ya62.ru") + page, err = driver.NewPage() + if err != nil { + log.Println("[YA62]: error new page") + } + + defer func() { + if err = page.Destroy(); err != nil { + log.Println("[YA62]: error page destroy") + } + }() + // get start page ya62.ru/news/incidents/ if err = page.Navigate(core.Config.YaUrl); err != nil { log.Println("[YA62]: error got main page: " + err.Error()) @@ -165,13 +194,18 @@ func (core *Core) Run() { return } + if mp, err = core.checkLink(mp); err != nil { + log.Printf("[YA62]: error check link from ya: " + err.Error()) + return + } + // range for links for url := range mp { time.Sleep(10 * time.Second) if err = page.Navigate(url); err != nil { - log.Println("[YA62]: error got main page: " + err.Error()) + log.Println("[YA62]: error got target page: " + err.Error()) return } @@ -192,59 +226,76 @@ func (core *Core) Run() { continue } } - }() - log.Println("Timeout 1 hour...") + }() if err = driver.Stop(); err != nil { log.Printf("error driver stop: %v\n", err) } + log.Println("Timeout 30 minutes...") + <-statsTimer.C } } -func (core *Core) checkPreSend(v models.Post) error { +func (core *Core) checkLink(m map[string]string) (map[string]string, error) { var ( - b []byte - err error + b []byte + hash string + err error ) + newMap := make(map[string]string) + + for k, v := range m { + + hash = core.stringToHash(k) + + if b, err = core.Store.Read("posts", hash); err != nil { + log.Printf("error read from db: %v\n", err) + panic(err) + } + + if len(b) == 0 { + newMap[k] = v + } + } + + return newMap, nil +} + +func (core *Core) checkPreSend(v models.Post) error { + + var err error + // checking for missing fields in a structure if core.checkBlank(&v) { return nil } - // read from the database post with this hash - b, err = core.Store.Read("posts", v.Hash) - if err != nil { + if err = core.sendToModerChannel(&v); err != nil { return err } - // there is no such post, send to the moderator channel - if len(b) == 0 { - if err = core.sendToModerChannel(&v); err != nil { - return err - } - - // marshal post - var post []byte - if post, err = json.Marshal(v); err != nil { - return err - } + // marshal post + var post []byte + if post, err = json.Marshal(v); err != nil { + return err + } - // write post to database - if err = core.Store.Write("posts", v.Hash, post); err != nil { - return err - } + // write post to database + if err = core.Store.Write("posts", v.Hash, post); err != nil { + return err + } - // writing ttl posts to database - if err = core.Store.Write("ttl", time.Now().UTC().Format(time.RFC3339Nano), - []byte(v.Hash)); err != nil { - return err - } + // writing ttl posts to database + if err = core.Store.Write("ttl", time.Now().UTC().Format(time.RFC3339Nano), + []byte(v.Hash)); err != nil { + return err } + return nil } diff --git a/internal/app/rzn.go b/internal/app/rzn.go index b922ce3..418774b 100644 --- a/internal/app/rzn.go +++ b/internal/app/rzn.go @@ -35,47 +35,32 @@ func (core *Core) getLinkRzn(html string) (map[string]string, error) { func (core *Core) catchPostFromRzn(html string) (models.Post, error) { - reg := regexp.MustCompile(`\s+`) + dote := regexp.MustCompile(`\.`) var ( - title, link, img string - err error + post models.Post + err error ) - post := models.Post{} - var doc *goquery.Document if doc, err = goquery.NewDocumentFromReader(strings.NewReader(html)); err != nil { return post, errors.New(err.Error()) } doc.Find("#newsContainer > div.row.url-checkpoint.newsItem.story > div.col.story__details > div > div.story__body > div.story__hero > div > img").Each(func(i int, s *goquery.Selection) { - img, _ = s.Attr("src") + post.Image, _ = s.Attr("src") }) doc.Find("#newsContainer > div.row.url-checkpoint.newsItem.story").Each(func(i int, s *goquery.Selection) { - title, _ = s.Attr("data-title") - link, _ = s.Attr("data-url") + post.Title, _ = s.Attr("data-title") + post.Link, _ = s.Attr("data-url") }) - var ( - str []string - body string - txt string - ) - doc.Find("#newsContainer > div.row.url-checkpoint.newsItem.story > div.col.story__details > div > div.story__body > div:nth-child(3)").Each(func(i int, s *goquery.Selection) { - txt = s.Text() - newTxt := reg.ReplaceAllString(txt, " ") - str = append(str, newTxt) + post.Body = dote.ReplaceAllString(s.Text(), ". ") }) - body = strings.Join(str, "") - post.Hash = core.stringToHash(title) - post.Title = title - post.Body = body - post.Image = img - post.Link = link + post.Hash = core.stringToHash(post.Link) return post, nil } diff --git a/internal/app/ya62.go b/internal/app/ya62.go index d92505c..320bf94 100644 --- a/internal/app/ya62.go +++ b/internal/app/ya62.go @@ -2,10 +2,11 @@ package app import ( "errors" - "github.com/MarlikAlmighty/analyze-it/internal/models" - "github.com/PuerkitoBio/goquery" "regexp" "strings" + + "github.com/MarlikAlmighty/analyze-it/internal/models" + "github.com/PuerkitoBio/goquery" ) func (core *Core) getLinkYa(html string) (map[string]string, error) { @@ -38,41 +39,41 @@ func (core *Core) catchPostFromYa(html, link string) (models.Post, error) { space := regexp.MustCompile(`[[:space:]]`) all := regexp.MustCompile(`\s+`) + tag := regexp.MustCompile(`[<\.+>]`) post := models.Post{} var ( - doc *goquery.Document - err error + doc *goquery.Document + body string + err error ) if doc, err = goquery.NewDocumentFromReader(strings.NewReader(html)); err != nil { return post, errors.New(err.Error()) } - var title, txt, img string - doc.Find("div.news-detail > h1").Each(func(i int, s *goquery.Selection) { - title = s.Text() + post.Title = s.Text() }) doc.Find("div.news-detail > figure > img").Each(func(i int, s *goquery.Selection) { - img, _ = s.Attr("src") - img = "https://ya62.ru" + img + tmp, _ := s.Attr("src") + post.Image = "https://ya62.ru" + tmp }) doc.Find("div.news-detail p").Each(func(i int, s *goquery.Selection) { - txt += s.Text() - txt = space.ReplaceAllString(txt, " ") - txt = all.ReplaceAllString(txt, " ") - txt = strings.Replace(txt, "<...>", "", 3) - txt = strings.Replace(txt, "YA62.ru", "", 3) - txt = strings.TrimSpace(txt) + body += s.Text() }) - post.Hash = core.stringToHash(title) - post.Title = title - post.Body = txt - post.Image = img + body = space.ReplaceAllString(body, " ") + body = all.ReplaceAllString(body, " ") + body = strings.Replace(body, "<...>", "", -1) + body = tag.ReplaceAllString(body, "") + body = strings.Replace(body, "YA62.ru", "", 3) + body = strings.TrimSpace(body) + + post.Hash = core.stringToHash(link) + post.Body = body post.Link = link return post, nil diff --git a/internal/config/config.go b/internal/config/config.go index ee88125..e217184 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1,10 +1,6 @@ package config -import ( - "encoding/json" - "io" - "os" -) +import "github.com/kelseyhightower/envconfig" // Configuration of app type Configuration struct { @@ -15,13 +11,26 @@ type Configuration struct { ModeratorChannel int64 `required:"true" split_words:"true"` } +func New() *Configuration { + return &Configuration{} +} + +// GetEnv configuration init +func (cnf *Configuration) GetEnv() error { + if err := envconfig.Process("", cnf); err != nil { + return err + } + return nil +} + +/* // LoadConfig load configuration from file func LoadConfig() (*Configuration, error) { var ( jsonFile *os.File err error ) - if jsonFile, err = os.Open("config.json"); err != nil { + if jsonFile, err = os.Open("./config.json"); err != nil { return nil, err } var b []byte @@ -34,3 +43,4 @@ func LoadConfig() (*Configuration, error) { } return conf, err } +*/ diff --git a/internal/store/bolt.go b/internal/store/bolt.go index 0f9fa45..8c20774 100644 --- a/internal/store/bolt.go +++ b/internal/store/bolt.go @@ -11,9 +11,9 @@ type Wrapper struct { } // New database and create buckets -func (r *Wrapper) New(posts, ttl string) (*Wrapper, error) { +func (r *Wrapper) New() (*Wrapper, error) { - db, err := bolt.Open("data.db", 0600, &bolt.Options{Timeout: 1 * time.Second}) + db, err := bolt.Open("./data.db", 0600, &bolt.Options{Timeout: 1 * time.Second}) if err != nil { return nil, err } @@ -107,9 +107,9 @@ func (r *Wrapper) GetExpired(maxAge time.Duration) ([][]byte, error) { err := r.DB.View(func(tx *bolt.Tx) error { c := tx.Bucket([]byte("ttl")).Cursor() - max := []byte(time.Now().UTC().Add(-maxAge).Format(time.RFC3339Nano)) + maxB := []byte(time.Now().UTC().Add(-maxAge).Format(time.RFC3339Nano)) - for k, v := c.First(); k != nil && bytes.Compare(k, max) <= 0; k, v = c.Next() { + for k, v := c.First(); k != nil && bytes.Compare(k, maxB) <= 0; k, v = c.Next() { keys = append(keys, v) ttlKeys = append(ttlKeys, k) }