Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
  • Loading branch information
MarlikAlmighty committed Dec 27, 2023
1 parent c10f0fa commit fc5fade
Show file tree
Hide file tree
Showing 13 changed files with 201 additions and 128 deletions.
11 changes: 4 additions & 7 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
.idea/
.git/
bin/
swagger.yaml
Procfile
Makefile
Dockerfile
docker-compose.yaml
.dockerignore
.gitignore
bin/
Makefile
Dockerfile
README.md

5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
.idea/
bin/
config.json
data.db
data.db

33 changes: 14 additions & 19 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,20 +1,7 @@
FROM golang:1.18-alpine AS builder
FROM golang:1.21-alpine3.18 AS builder

ENV CGO_ENABLED 0
ENV TZ=Europe/Moscow

RUN apk update && apk upgrade && apk add --no-cache chromium

RUN echo @edge http://nl.alpinelinux.org/alpine/edge/community >> /etc/apk/repositories \
&& echo @edge http://nl.alpinelinux.org/alpine/edge/main >> /etc/apk/repositories \
&& apk add --no-cache \
harfbuzz@edge \
nss@edge \
freetype@edge \
ttf-freefont@edge \
&& rm -rf /var/cache/* \
&& mkdir /var/cache/apk

WORKDIR /go/src/analyze

COPY . .
Expand All @@ -25,15 +12,23 @@ FROM gruebel/upx:latest as upx
COPY --from=builder /go/src/analyze/app /app
RUN upx --best --lzma -o /analyze /app

FROM scratch
#FROM scratch
FROM golang:1.21-alpine3.18

COPY --from=upx /app /app

ENV BOT_TOKEN=""
ENV CHANNEL=""
RUN echo @edge http://nl.alpinelinux.org/alpine/edge/community >> /etc/apk/repositories \
&& echo @edge http://nl.alpinelinux.org/alpine/edge/main >> /etc/apk/repositories \
&& apk update && apk upgrade \
&& apk add --no-cache ca-certificates && update-ca-certificates \
&& apk add --no-cache chromium chromium-chromedriver \
&& rm -rf /var/cache/* \
&& mkdir /var/cache/apk

ENV RZN_URL=""
ENV YA_URL=""
ENV REDIS_URL="redis://127.0.0.1:6379"
ENV BOT_TOKEN=""
ENV MAIN_CHANNEL="-100***"
ENV MODERATOR_CHANNEL="-955***"

EXPOSE 3000
CMD ["/app"]
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
# Analyze
## Analyze

### A simple news parser for yourself.
### A couple of examples of how to parse sites using chromium-chromedriver

### For the parser to work, you need a driver, installation:
First, install the driver:

```sh
sudo apt -y install chromium-chromedriver
```

### Build and start docker container

```sh
docker buildx build . -t analyze
docker run -v /dev/shm:/dev/shm -itd --rm analyze
```



23 changes: 23 additions & 0 deletions analyze.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# systemd unit that runs the analyze binary as a long-lived service.
[Unit]
Description=Analyze
# Order this unit after network.target.
After=network.target

[Service]
# Environment passed to the process. The values shown are masked ("***")
# and presumably must be replaced with real ones before use.
# NOTE(review): the Dockerfile in this commit sets MAIN_CHANNEL, while this
# unit sets CHANNEL - confirm which name the config loader expects.
Environment="BOT_TOKEN=578***"
Environment="CHANNEL=-100***"
Environment="MODERATOR_CHANNEL=-955***"
Environment="RZN_URL=https://***"
Environment="YA_URL=https://***"
Type=simple
# NOTE(review): PIDFile is normally paired with Type=forking; with
# Type=simple it is likely unnecessary - confirm against systemd.service(5).
PIDFile=/var/run/analyze.pid
WorkingDirectory=/home/marlik/analyze
ExecStart=/home/marlik/analyze/bin/app
# Stop the service by sending SIGINT to the main process.
ExecStop=/bin/kill -INT $MAINPID
# The process runs as root.
User=root
Group=root
# stdout and stderr both go to the same log file.
StandardOutput=file:/var/log/analyze.log
StandardError=file:/var/log/analyze.log
# Restart the process whenever it exits.
Restart=always

[Install]
# Enabling the unit hooks it into multi-user.target.
WantedBy=multi-user.target
8 changes: 5 additions & 3 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,17 @@ import (

func main() {

var err error

// got config
cnf, err := config.LoadConfig()
if err != nil {
cnf := config.New()
if err = cnf.GetEnv(); err != nil {
log.Fatalf("error config: %v\n", err)
}

// connect to store
var r *store.Wrapper
if r, err = r.New("posts", "ttl"); err != nil {
if r, err = r.New(); err != nil {
log.Fatalf("error store: %v\n", err)
}

Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ require (
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/kelseyhightower/envconfig v1.4.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/onsi/ginkgo v1.16.5 // indirect
github.com/onsi/gomega v1.27.8 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dvMUtDTo2cv8=
github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
Expand Down
133 changes: 92 additions & 41 deletions internal/app/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type (
Run()
Stop()
//browser(opts []chromedp.ExecAllocatorOption, url string) (string, error)
checkLink(m map[string]string) (map[string]string, error)
checkPreSend(v models.Post) error
checkBlank(v *models.Post) bool
sendToModerChannel(p *models.Post) error
Expand All @@ -54,7 +55,7 @@ type (

func (core *Core) Run() {

statsInt := core.mustParseDuration("1h")
statsInt := core.mustParseDuration("30m")
statsTimer := time.NewTimer(statsInt)
mp := make(map[string]string)

Expand All @@ -67,7 +68,13 @@ func (core *Core) Run() {

// init driver
driver := agouti.ChromeDriver(
agouti.ChromeOptions("args", []string{"--headless", "--disable-gpu", "--no-sandbox"}),
agouti.ChromeOptions("args", []string{
"--headless",
"--disable-gpu",
"--no-sandbox",
"--whitelisted-ips",
}),
// agouti.Debug,
)

// ttl all posts
Expand All @@ -77,6 +84,11 @@ func (core *Core) Run() {

statsTimer.Reset(statsInt)

if err = driver.Start(); err != nil {
log.Printf("error driver start: %v\n", err)
return
}

func() {
log.Println("start clear database")
if err = core.Store.Sweep(maxAge); err != nil {
Expand All @@ -85,20 +97,21 @@ func (core *Core) Run() {
}
}()

if err = driver.Start(); err != nil {
log.Printf("error driver start: %v\n", err)
return
}

if page, err = driver.NewPage(); err != nil {
log.Printf("error new page: %v\n", err)
return
}

func() {

log.Println("start parsing rzn.info")

page, err = driver.NewPage()
if err != nil {
log.Println("[RZN]: error new page")
}

defer func() {
if err = page.Destroy(); err != nil {
log.Println("[RZN]: error page destroy")
}
}()

if err = page.Navigate(core.Config.RznUrl); err != nil {
log.Println("[RZN]: error got main page: " + err.Error())
return
Expand All @@ -115,6 +128,11 @@ func (core *Core) Run() {
return
}

if mp, err = core.checkLink(mp); err != nil {
log.Println("[RZN]: error check link from rzn: " + err.Error())
return
}

// range for links
for url := range mp {

Expand Down Expand Up @@ -148,6 +166,17 @@ func (core *Core) Run() {

log.Println("start parsing ya62.ru")

page, err = driver.NewPage()
if err != nil {
log.Println("[YA62]: error new page")
}

defer func() {
if err = page.Destroy(); err != nil {
log.Println("[YA62]: error page destroy")
}
}()

// get start page ya62.ru/news/incidents/
if err = page.Navigate(core.Config.YaUrl); err != nil {
log.Println("[YA62]: error got main page: " + err.Error())
Expand All @@ -165,13 +194,18 @@ func (core *Core) Run() {
return
}

if mp, err = core.checkLink(mp); err != nil {
log.Printf("[YA62]: error check link from ya: " + err.Error())
return
}

// range for links
for url := range mp {

time.Sleep(10 * time.Second)

if err = page.Navigate(url); err != nil {
log.Println("[YA62]: error got main page: " + err.Error())
log.Println("[YA62]: error got target page: " + err.Error())
return
}

Expand All @@ -192,59 +226,76 @@ func (core *Core) Run() {
continue
}
}
}()

log.Println("Timeout 1 hour...")
}()

if err = driver.Stop(); err != nil {
log.Printf("error driver stop: %v\n", err)
}

log.Println("Timeout 30 minutes...")

<-statsTimer.C
}
}

func (core *Core) checkPreSend(v models.Post) error {
// checkLink returns the subset of m whose keys are not present in the store.
//
// Each key of m (Run ranges over the keys as link URLs) is hashed with
// core.stringToHash and looked up via core.Store.Read("posts", hash).
// Entries for which the read returns no data are copied into a new map;
// all other entries are dropped. The input map m is never modified.
//
// A failed store read is returned to the caller as an error and the result
// map is nil. Both call sites in Run already log the error and give up on
// the current site, so there is no need to panic here.
func (core *Core) checkLink(m map[string]string) (map[string]string, error) {
	fresh := make(map[string]string, len(m))

	for link, val := range m {
		b, err := core.Store.Read("posts", core.stringToHash(link))
		if err != nil {
			// let the caller decide what to do; it logs with site context
			return nil, err
		}

		// an empty read means nothing is stored under this hash yet
		if len(b) == 0 {
			fresh[link] = val
		}
	}

	return fresh, nil
}

func (core *Core) checkPreSend(v models.Post) error {

var err error

// checking for missing fields in a structure
if core.checkBlank(&v) {
return nil
}

// read from the database post with this hash
b, err = core.Store.Read("posts", v.Hash)
if err != nil {
if err = core.sendToModerChannel(&v); err != nil {
return err
}

// there is no such post, send to the moderator channel
if len(b) == 0 {
if err = core.sendToModerChannel(&v); err != nil {
return err
}

// marshal post
var post []byte
if post, err = json.Marshal(v); err != nil {
return err
}
// marshal post
var post []byte
if post, err = json.Marshal(v); err != nil {
return err
}

// write post to database
if err = core.Store.Write("posts", v.Hash, post); err != nil {
return err
}
// write post to database
if err = core.Store.Write("posts", v.Hash, post); err != nil {
return err
}

// writing ttl posts to database
if err = core.Store.Write("ttl", time.Now().UTC().Format(time.RFC3339Nano),
[]byte(v.Hash)); err != nil {
return err
}
// writing ttl posts to database
if err = core.Store.Write("ttl", time.Now().UTC().Format(time.RFC3339Nano),
[]byte(v.Hash)); err != nil {
return err
}

return nil
}

Expand Down
Loading

0 comments on commit fc5fade

Please sign in to comment.