
Commit afbe782 (parent c4c70d6)

Add Docker support with Dockerfile and docker-compose.yaml; restructure project files and update requirements

10 files changed: +148 -58 lines

.gitignore (+5)

@@ -3,6 +3,11 @@ __pycache__/
 *.py[cod]
 *$py.class
 
+#PynewFiles
+pynews*
+format.json
+export.py
+
 # C extensions
 *.so
 
Dockerfile (+22)

@@ -0,0 +1,22 @@
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y \
+    python3-pip \
+    python3.10 \
+    tzdata && \
+    ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
+    dpkg-reconfigure --frontend noninteractive tzdata && \
+    apt-get clean && rm -rf /var/lib/apt/lists/* && \
+    mkdir -p app && \
+    pip3 install -U crawl4ai && \
+    playwright install && \
+    playwright install-deps
+
+WORKDIR /app
+
+COPY ./app/. .
+COPY requirements.txt .
+
+RUN pip3 install -r requirements.txt
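
With this Dockerfile alone, `COPY ./app/. .` drops the project scripts directly into `WORKDIR /app`, so the entry point would sit at `/app/getNews.py`; the `/app/app/getNews.py` path used by the README's Compose commands therefore implies the Compose file remaps the sources. A minimal smoke-test sketch, assuming an illustrative `pynews` image tag (not taken from this diff):

```sh
# Build the image from the repository root, where the Dockerfile lives
docker build -t pynews .

# Run the releases step once without Compose; with this Dockerfile alone
# the script lands at /app/getNews.py (the Compose setup may mount it elsewhere)
docker run --rm pynews python3 /app/getNews.py releases
```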

README.md (+33 -31)

@@ -6,33 +6,32 @@ This Scraper uses AI so that it does not require constant updates
 
 # How to use
 
-To create a virtual environment, run
-```
-python -m venv .venv
-```
+## Installing Docker and Docker Compose
 
-To activate the virtual environment, run
-```
-source .venv/bin/activate
-```
+### Ubuntu
 
-To install the project dependencies in the virtual environment, run
-```
-pip install -r requirements.txt
-```
+1. **Install Docker:**
 
-To complete the Crawl4AI installation
-```
-# Install the package
-pip install -U crawl4ai
+https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-on-ubuntu-22-04
 
-# Run post-installation setup
-crawl4ai-setup
+2. **Install Docker Compose:**
+
+https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-compose-on-ubuntu-22-04
+
+
+### Windows
+
+1. **Install Docker:**
+
+https://docs.docker.com/desktop/setup/install/windows-install/
+
+2. **Install Docker Compose:**
+
+https://docs.docker.com/compose/install/
+
+
+### Create a key with COHERE
 
-# Verify your installation
-crawl4ai-doctor
-```
-Create a key with COHERE
 - https://docs.cohere.com/
 <p>
 
@@ -42,20 +41,23 @@ Update the value in prompt.py
 os.environ["COHERE_API_KEY"] = "<YOUR-API-KEY>"
 ...
 ```
-Install Thinker
-```cmd
-sudo apt-get install python3-tk
+
+## Build the container
+```sh
+docker compose build
 ```
 
-To search for releases, run the command below and wait for the instructions
+## To search for releases, run the command below and wait for the instructions
 ```
-python getNews.py releases
+docker compose run --rm --remove-orphans pynews python3 /app/app/getNews.py releases
 ```
 
-To create the summaries, run
+## To create the summaries, run
 ```
-python getNews.py slides
+docker compose run --rm --remove-orphans pynews python3 /app/app/getNews.py slides
 ```
+<p>
+<p>
+<p>
 
-
-To deactivate the virtual environment, run `deactivate`.
+## This script depends on COHERE AI; this AI, like all others, does not yet behave stably, so this script should be used under supervision
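
The docker-compose.yaml added by this commit does not appear in the diff above, but the run commands constrain its shape: the service must be named `pynews`, and the `/app/app/getNews.py` path suggests the sources are mounted one level below the image's `WORKDIR /app`. A hypothetical minimal sketch consistent with those commands, not the committed file:

```yaml
# Hypothetical reconstruction -- the actual docker-compose.yaml is not shown in this diff.
services:
  pynews:
    build: .             # builds from the Dockerfile added in this commit
    volumes:
      - ./app:/app/app   # would explain the /app/app/getNews.py path in the run commands
```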

app/__init__.py

Whitespace-only changes.

bibliotecas.py → app/bibliotecas.py (+32 -1)

@@ -3,126 +3,151 @@
         "library_name": "Requests",
         "releases_url": "https://pypi.org/project/requests/",
         "logo": "https://requests.readthedocs.io/en/latest/_static/requests-sidebar.png",
+        "repository": "https://github.com/psf/requests",
     },
     "Scikit-learn": {
         "library_name": "Scikit-learn",
         "releases_url": "https://pypi.org/project/scikit-learn/",
         "logo": "https://scikit-learn.org/stable/_static/scikit-learn-logo-small.png",
+        "repository": "https://github.com/scikit-learn/scikit-learn",
     },
     "Numpy": {
         "library_name": "Numpy",
         "releases_url": "https://pypi.org/project/numpy/",
         "logo": "https://numpy.org/devdocs/_static/numpylogo.svg",
+        "repository": "https://github.com/numpy/numpy",
     },
     "MatPlotLib": {
         "library_name": "MatPlotLib",
         "releases_url": "https://pypi.org/project/matplotlib/",
         "logo": "https://matplotlib.org/stable/_static/logo_light.svg",
+        "repository": "https://github.com/matplotlib/matplotlib",
     },
     "AIOHttp": {
         "library_name": "AIOHttp",
         "releases_url": "https://pypi.org/project/aiohttp/",
         "logo": "https://docs.aiohttp.org/en/stable/_static/aiohttp-plain.svg",
+        "repository": "https://github.com/aio-libs/aiohttp",
     },
     "Pandas": {
         "library_name": "Pandas",
         "releases_url": "https://pypi.org/project/pandas/",
         "logo": "https://pandas.pydata.org/static/img/pandas_mark.svg",
+        "repository": "https://github.com/pandas-dev/pandas",
     },
     "FastAPI": {
         "library_name": "FastAPI",
         "releases_url": "https://pypi.org/project/fastapi/",
-        "logo": "https://fastapi.tiangolo.com/img/icon.png",
+        "logo": "https://camo.githubusercontent.com/4ebb06d037b495f2c4c67e0ee4599f747e94e6323ece758a7da27fbbcb411250/68747470733a2f2f666173746170692e7469616e676f6c6f2e636f6d2f696d672f6c6f676f2d6d617267696e2f6c6f676f2d7465616c2e706e67",
+        "repository": "https://github.com/fastapi/fastapi",
     },
     "Django": {
         "library_name": "Django",
         "releases_url": "https://pypi.org/project/Django/",
         "logo": "https://static.djangoproject.com/img/logos/django-logo-negative.png",
+        "repository": "https://github.com/django/django",
     },
     "Seaborn": {
         "library_name": "Seaborn",
         "releases_url": "https://pypi.org/project/seaborn/",
         "logo": "https://seaborn.pydata.org/_images/logo-wide-lightbg.svg",
+        "repository": "https://github.com/mwaskom/seaborn",
     },
     "TensorFlow": {
         "library_name": "TensorFlow",
         "releases_url": "https://pypi.org/project/tensorflow/",
         "logo": "https://www.tensorflow.org/images/tf_logo_social.png",
+        "repository": "https://github.com/tensorflow/tensorflow",
     },
     "Keras": {
         "library_name": "Keras",
         "releases_url": "https://pypi.org/project/keras/",
         "logo": "https://keras.io/img/logo.png",
+        "repository": "https://github.com/keras-team/keras",
     },
     "PyTorch": {
         "library_name": "PyTorch",
         "releases_url": "https://pypi.org/project/torch/",
         "logo": "https://pytorch.org/assets/images/pytorch-logo.png",
+        "repository": "https://github.com/pytorch/pytorch",
     },
     "SQLAlchemy": {
         "library_name": "SQLAlchemy",
         "releases_url": "https://pypi.org/project/SQLAlchemy/",
         "logo": "https://www.sqlalchemy.org/img/sqla_logo.png",
+        "repository": "https://github.com/sqlalchemy/sqlalchemy",
     },
     "BeaultifulSoup": {
         "library_name": "BeaultifulSoup",
         "releases_url": "https://pypi.org/project/beautifulsoup4/",
         "logo": "https://www.crummy.com/software/BeautifulSoup/10.1.jpg",
+        "repository": None,
     },
     "LangChain": {
         "library_name": "LangChain",
         "releases_url": "https://pypi.org/project/langchain/",
         "logo": "https://python.langchain.com/img/brand/wordmark-dark.png",
+        "repository": "https://github.com/langchain-ai/langchain",
     },
     "CrewAI": {
         "library_name": "CrewAI",
         "releases_url": "https://pypi.org/project/crewai/",
         "logo": "https://cdn.prod.website-files.com/66cf2bfc3ed15b02da0ca770/66d07240057721394308addd_Logo%20(1).svg",
+        "repository": "https://github.com/crewAIInc/crewAI",
     },
     "Flask": {
         "library_name": "Flask",
         "releases_url": "https://pypi.org/project/Flask/",
         "logo": "https://flask.palletsprojects.com/en/stable/_static/flask-vertical.png",
+        "repository": "https://github.com/pallets/flask",
     },
     "Pygame": {
         "library_name": "Pygame",
         "releases_url": "https://pypi.org/project/pygame/",
         "logo": "https://www.pygame.org/images/logo_lofi.png",
+        "repository": "https://github.com/pygame/pygame",
     },
     "Thinker": {
         "library_name": "Thinker",
         "releases_url": "https://pypi.org/project/thinker/",
         "logo": "https://keras.io/img/logo.png",
+        "repository": "https://github.com/mehmetkose/thinker",
     },
     "Plotly": {
         "library_name": "Plotly",
         "releases_url": "https://pypi.org/project/plotly/",
         "logo": "https://plotly.com/static/img/logos/plotly-logomark.svg",
+        "repository": "https://github.com/plotly/plotly.py",
     },
     "MlForecast": {
         "library_name": "MlForecast",
         "releases_url": "https://pypi.org/project/mlforecast/",
         "logo": "https://raw.githubusercontent.com/Nixtla/mlforecast/main/nbs/figs/logo.png",
+        "repository": "https://github.com/Nixtla/mlforecast",
     },
     "GeoPandas": {
         "library_name": "GeoPandas",
         "releases_url": "https://pypi.org/project/geopandas/",
         "logo": "https://geopandas.org/en/stable/_static/geopandas_logo_web.svg",
+        "repository": "https://github.com/geopandas/geopandas",
     },
     "AirFlow": {
         "library_name": "AirFlow",
         "releases_url": "https://pypi.org/project/apache-airflow/",
         "logo": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/de/AirflowLogo.png/800px-AirflowLogo.png?20191014185111",
+        "repository": "https://github.com/apache/airflow",
     },
     "PySpark": {
         "library_name": "PySpark",
         "releases_url": "https://pypi.org/project/pyspark/",
         "logo": "https://spark.apache.org/docs/latest/api/python/_static/spark-logo-reverse.png",
+        "repository": "https://github.com/apache/spark/tree/master/python",
     },
     "Gym": {
         "library_name": "Gym",
         "releases_url": "https://pypi.org/project/gym/",
         "logo": "https://www.gymlibrary.dev/_static/img/gym_logo_black.svg",
+        "repository": "https://github.com/Farama-Foundation/Gymnasium",
     },
     "HyperOpt": {
         "library_name": "HyperOpt",
@@ -139,4 +164,10 @@
         "releases_url": "https://crawl4ai.com/mkdocs/blog/",
         "logo": "https://star-history.com/#unclecode/crawl4ai&Date",
     },
+    "ScanAPI": {
+        "library_name": "ScanAPI",
+        "releases_url": "https://pypi.org/project/scanapi/",
+        "logo": "https://avatars.githubusercontent.com/u/59395469?s=200&v=4",
+        "repository": "https://github.com/scanapi/scanapi",
+    },
 }

File renamed without changes.
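
The new `repository` field attaches a source URL to every entry, and at least one entry (BeaultifulSoup) stores `None`, so any consumer has to tolerate a missing link. A small illustrative sketch of reading the field (the helper function is not part of the codebase):

```python
from bibliotecas import bibliotecas

def list_repositories() -> None:
    """Print each library's repository URL, tolerating entries without one."""
    for name, meta in bibliotecas.items():
        repo = meta.get("repository")  # None for e.g. BeaultifulSoup
        print(f"{name}: {repo or 'no repository listed'}")

if __name__ == "__main__":
    list_repositories()
```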

getNews.py → app/getNews.py (+26 -13)

@@ -4,8 +4,9 @@
 import time
 from datetime import datetime
 from json import dump
-from tkinter import messagebox
 
+from bibliotecas import bibliotecas
+from cacheVariables import pynews
 from crawl4ai import (
     AsyncWebCrawler,
     BrowserConfig,
@@ -14,9 +15,6 @@
     DefaultMarkdownGenerator,
     PruningContentFilter,
 )
-
-from bibliotecas import bibliotecas
-from cacheVariables import pynews
 from prompt import Smart
 
 md_generator = DefaultMarkdownGenerator(
@@ -60,7 +58,16 @@ async def fetch(self, lib, url_name):
         )
         new_html = html.markdown_v2
 
-        response = Smart().answer(url_name, lib, new_html.fit_markdown)
+        response = {}
+        try:
+            response = Smart().answer(url_name, lib, new_html.fit_markdown)
+        except Exception:
+            print("$$$ COHERE AI Time Out $$$")
+            print("Biblioteca não processada :", lib["library_name"])
+            print(
+                "Tente rodar o script novamente somente com a lib : ",
+                lib["library_name"],
+            )
 
         release_date = response.get("release_date")
 
@@ -113,12 +120,6 @@ async def main():
         f"\033[1;37;44m\033[1;30m\n Edite o arquivo {news_json_file} \
 para adicionar as urls contendo o descritivo das novas releases \n"
     )
-    messagebox.showwarning(
-        "Aviso",
-        f"Edite o arquivo {news_json_file} para adicionar as urls contendo \
-o descritivo das novas releases",
-        icon="warning",
-    )
 
 
 async def slides():
@@ -127,8 +128,20 @@ async def slides():
     with open(news_json_file, "r", encoding="utf-8") as f:
         news.study_case = json.load(f)
     await news.get("releases_doc_url")
-    with open(news_json_file, "w", encoding="utf-8") as f:
-        dump(pynews, f)
+    slides = {}
+    try:
+        with open("pynews_slides.json", "r", encoding="utf-8") as f:
+            slides = json.load(f)
+    except Exception:
+        pass
+    for item in pynews:
+        slides[item] = pynews[item]
+    with open("pynews_slides.json", "w", encoding="utf-8") as f:
+        dump(slides, f)
+    print(
+        "\033[1;37;44m\033[1;30m\n Abra o arquivo pynews_slides.json para \
+ter acesso ao conteúdo produzido.\n"
+    )
 
 
 if __name__ == "__main__":
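
The rewritten slides() no longer overwrites news_json_file: it now accumulates results in pynews_slides.json, loading any existing file first and letting fresh entries replace stale ones. The merge pattern, distilled into a standalone sketch (the function name is illustrative):

```python
import json

def merge_into_slides(pynews: dict, path: str = "pynews_slides.json") -> None:
    """Merge freshly scraped entries into the accumulated slides file,
    mirroring slides(): load the file if present, overwrite matching
    keys with the new data, and write everything back."""
    slides = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            slides = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        pass  # first run or unreadable file: start from an empty dict
    slides.update(pynews)  # new results win over old entries
    with open(path, "w", encoding="utf-8") as f:
        json.dump(slides, f, ensure_ascii=False)
```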
