|
1 |
| -FROM python:3.11-slim |
2 |
| - |
3 |
| -ENV DEBIAN_FRONTEND="noninteractive" |
4 |
| - |
5 |
| -LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors" |
6 |
| -LABEL org.opencontainers.image.licenses="MIT" |
7 |
| -LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file" |
8 |
| - |
9 |
| -# Enable non-free archive for `unrar`. |
10 |
| -RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \ |
11 |
| - && apt-get -qq -y update \ |
12 |
| - && apt-get -qq -y install build-essential locales \ |
13 |
| - # python deps (mostly to install their dependencies) |
14 |
| - git python3-dev \ |
15 |
| - pkg-config libicu-dev \ |
16 |
| - # tesseract |
17 |
| - tesseract-ocr libtesseract-dev libleptonica-dev \ |
18 |
| - # libraries |
19 |
| - libldap2-dev libsasl2-dev \ |
20 |
| - # package tools |
21 |
| - unrar p7zip-full \ |
22 |
| - # audio & video metadata |
23 |
| - libmediainfo-dev \ |
24 |
| - # image processing, djvu |
25 |
| - mdbtools djvulibre-bin \ |
26 |
| - libtiff5-dev \ |
27 |
| - libtiff-tools ghostscript librsvg2-bin jbig2dec \ |
28 |
| - pst-utils libgif-dev \ |
29 |
| - # necessary for python-magic |
30 |
| - libmagic1 \ |
31 |
| - ### tesseract |
32 |
| - tesseract-ocr-eng \ |
33 |
| - tesseract-ocr-swa \ |
34 |
| - tesseract-ocr-swe \ |
35 |
| - # tesseract-ocr-tam \ |
36 |
| - # tesseract-ocr-tel \ |
37 |
| - tesseract-ocr-fil \ |
38 |
| - # tesseract-ocr-tha \ |
39 |
| - tesseract-ocr-tur \ |
40 |
| - tesseract-ocr-ukr \ |
41 |
| - # tesseract-ocr-vie \ |
42 |
| - tesseract-ocr-nld \ |
43 |
| - tesseract-ocr-nor \ |
44 |
| - tesseract-ocr-pol \ |
45 |
| - tesseract-ocr-por \ |
46 |
| - tesseract-ocr-ron \ |
47 |
| - tesseract-ocr-rus \ |
48 |
| - tesseract-ocr-slk \ |
49 |
| - tesseract-ocr-slv \ |
50 |
| - tesseract-ocr-spa \ |
51 |
| - # tesseract-ocr-spa_old \ |
52 |
| - tesseract-ocr-sqi \ |
53 |
| - tesseract-ocr-srp \ |
54 |
| - tesseract-ocr-ind \ |
55 |
| - tesseract-ocr-isl \ |
56 |
| - tesseract-ocr-ita \ |
57 |
| - # tesseract-ocr-ita_old \ |
58 |
| - # tesseract-ocr-jpn \ |
59 |
| - tesseract-ocr-kan \ |
60 |
| - tesseract-ocr-kat \ |
61 |
| - # tesseract-ocr-kor \ |
62 |
| - tesseract-ocr-khm \ |
63 |
| - tesseract-ocr-lav \ |
64 |
| - tesseract-ocr-lit \ |
65 |
| - # tesseract-ocr-mal \ |
66 |
| - tesseract-ocr-mkd \ |
67 |
| - tesseract-ocr-mya \ |
68 |
| - tesseract-ocr-mlt \ |
69 |
| - tesseract-ocr-msa \ |
70 |
| - tesseract-ocr-est \ |
71 |
| - # tesseract-ocr-eus \ |
72 |
| - tesseract-ocr-fin \ |
73 |
| - tesseract-ocr-fra \ |
74 |
| - tesseract-ocr-frk \ |
75 |
| - # tesseract-ocr-frm \ |
76 |
| - # tesseract-ocr-glg \ |
77 |
| - # tesseract-ocr-grc \ |
78 |
| - tesseract-ocr-heb \ |
79 |
| - tesseract-ocr-hin \ |
80 |
| - tesseract-ocr-hrv \ |
81 |
| - tesseract-ocr-hye \ |
82 |
| - tesseract-ocr-hun \ |
83 |
| - # tesseract-ocr-ben \ |
84 |
| - tesseract-ocr-bul \ |
85 |
| - tesseract-ocr-cat \ |
86 |
| - tesseract-ocr-ces \ |
87 |
| - tesseract-ocr-nep \ |
88 |
| - # tesseract-ocr-chi_sim \ |
89 |
| - # tesseract-ocr-chi_tra \ |
90 |
| - # tesseract-ocr-chr \ |
91 |
| - tesseract-ocr-dan \ |
92 |
| - tesseract-ocr-deu \ |
93 |
| - tesseract-ocr-ell \ |
94 |
| - # tesseract-ocr-enm \ |
95 |
| - # tesseract-ocr-epo \ |
96 |
| - # tesseract-ocr-equ \ |
97 |
| - tesseract-ocr-afr \ |
98 |
| - tesseract-ocr-ara \ |
99 |
| - tesseract-ocr-aze \ |
100 |
| - tesseract-ocr-bel \ |
101 |
| - tesseract-ocr-uzb \ |
102 |
| - ### pdf convert: libreoffice + a bunch of fonts |
103 |
| - libreoffice fonts-opensymbol hyphen-fr hyphen-de \ |
104 |
| - hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \ |
105 |
| - fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \ |
106 |
| - fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \ |
107 |
| - fonts-tlwg-purisa \ |
108 |
| - ### |
109 |
| - && apt-get -qq -y autoremove \ |
110 |
| - && apt-get clean \ |
111 |
| - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ |
112 |
| - && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 |
113 |
| - |
114 |
| -# Set up the locale and make sure the system uses unicode for the file system. |
115 |
| -ENV LANG='en_US.UTF-8' \ |
116 |
| - TZ='UTC' \ |
117 |
| - OMP_THREAD_LIMIT='1' \ |
118 |
| - OPENBLAS_NUM_THREADS='1' |
119 |
| - |
120 |
| -RUN groupadd -g 1000 -r app \ |
121 |
| - && useradd -m -u 1000 -s /bin/false -g app app |
122 |
| - |
123 |
| -# Download the ftm-typepredict model |
124 |
| -RUN mkdir /models/ && \ |
125 |
| - curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz" |
126 |
| - |
127 |
| -COPY requirements.txt /tmp/ |
128 |
| -RUN pip3 install --no-cache-dir -q -U pip setuptools |
129 |
| -RUN pip3 install --no-binary=:pyicu: pyicu |
130 |
| -RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt |
131 |
| - |
132 |
| -# Install spaCy models |
133 |
| -RUN python3 -m spacy download en_core_web_sm \ |
134 |
| - && python3 -m spacy download de_core_news_sm \ |
135 |
| - && python3 -m spacy download fr_core_news_sm \ |
136 |
| - && python3 -m spacy download es_core_news_sm |
137 |
| -RUN python3 -m spacy download ru_core_news_sm \ |
138 |
| - && python3 -m spacy download pt_core_news_sm \ |
139 |
| - && python3 -m spacy download ro_core_news_sm \ |
140 |
| - && python3 -m spacy download mk_core_news_sm |
141 |
| -RUN python3 -m spacy download el_core_news_sm \ |
142 |
| - && python3 -m spacy download pl_core_news_sm \ |
143 |
| - && python3 -m spacy download it_core_news_sm \ |
144 |
| - && python3 -m spacy download lt_core_news_sm \ |
145 |
| - && python3 -m spacy download nl_core_news_sm \ |
146 |
| - && python3 -m spacy download nb_core_news_sm \ |
147 |
| - && python3 -m spacy download da_core_news_sm |
148 |
| -# RUN python3 -m spacy download zh_core_web_sm |
| 1 | +FROM ghcr.io/openaleph/ingest-file-base:latest |
149 | 2 |
|
150 | 3 | COPY . /ingestors
|
151 | 4 | WORKDIR /ingestors
|
152 |
| -RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors |
153 |
| -RUN chown -R app:app /ingestors |
154 |
| - |
| 5 | +RUN pip3 install --no-cache-dir -r /ingestors/requirements.txt |
| 6 | +RUN pip3 install --no-cache-dir /ingestors |
155 | 7 |
|
156 | 8 | ENV ARCHIVE_TYPE=file \
|
157 | 9 | ARCHIVE_PATH=/data \
|
158 | 10 | FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
|
159 | 11 | REDIS_URL=redis://redis:6379/0 \
|
160 |
| - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata |
161 |
| - |
162 |
| -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libgomp.so.1" |
| 12 | + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata |
163 | 13 |
|
164 |
| -# USER app |
| 14 | +USER app |
165 | 15 | CMD ingestors process
|
0 commit comments