|
1 | | -FROM ubuntu:20.04 |
| 1 | +FROM python:3.9-bookworm |
2 | 2 | ENV DEBIAN_FRONTEND noninteractive |
3 | 3 |
|
4 | 4 | LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors" |
5 | 5 | LABEL org.opencontainers.image.licenses MIT |
6 | 6 | LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file |
7 | 7 |
|
8 | 8 | # Enable non-free archive for `unrar`. |
9 | | -# RUN echo "deb http://http.us.debian.org/debian stretch non-free" >/etc/apt/sources.list.d/nonfree.list |
10 | | -RUN apt-get -qq -y update \ |
11 | | - && apt-get -qq -y install build-essential locales ca-certificates \ |
12 | | - # git |
13 | | - git \ |
14 | | - # python deps (mostly to install their dependencies) |
15 | | - python3-pip python3-dev python3-pil \ |
16 | | - # tesseract |
17 | | - tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\ |
18 | | - # libraries |
19 | | - libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \ |
20 | | - zlib1g-dev libicu-dev libxml2-dev \ |
21 | | - # package tools |
22 | | - unrar p7zip-full \ |
23 | | - # audio & video metadata |
24 | | - libmediainfo-dev \ |
25 | | - # image processing, djvu |
26 | | - imagemagick-common imagemagick mdbtools djvulibre-bin \ |
27 | | - libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \ |
28 | | - libtiff-tools ghostscript librsvg2-bin jbig2dec \ |
29 | | - pst-utils \ |
30 | | - ### tesseract |
31 | | - tesseract-ocr-eng \ |
32 | | - tesseract-ocr-swa \ |
33 | | - tesseract-ocr-swe \ |
34 | | - # tesseract-ocr-tam \ |
35 | | - # tesseract-ocr-tel \ |
36 | | - tesseract-ocr-fil \ |
37 | | - # tesseract-ocr-tha \ |
38 | | - tesseract-ocr-tur \ |
39 | | - tesseract-ocr-ukr \ |
40 | | - # tesseract-ocr-vie \ |
41 | | - tesseract-ocr-nld \ |
42 | | - tesseract-ocr-nor \ |
43 | | - tesseract-ocr-pol \ |
44 | | - tesseract-ocr-por \ |
45 | | - tesseract-ocr-ron \ |
46 | | - tesseract-ocr-rus \ |
47 | | - tesseract-ocr-slk \ |
48 | | - tesseract-ocr-slv \ |
49 | | - tesseract-ocr-spa \ |
50 | | - # tesseract-ocr-spa_old \ |
51 | | - tesseract-ocr-sqi \ |
52 | | - tesseract-ocr-srp \ |
53 | | - tesseract-ocr-ind \ |
54 | | - tesseract-ocr-isl \ |
55 | | - tesseract-ocr-ita \ |
56 | | - # tesseract-ocr-ita_old \ |
57 | | - # tesseract-ocr-jpn \ |
58 | | - tesseract-ocr-kan \ |
59 | | - tesseract-ocr-kat \ |
60 | | - # tesseract-ocr-kor \ |
61 | | - tesseract-ocr-khm \ |
62 | | - tesseract-ocr-lav \ |
63 | | - tesseract-ocr-lit \ |
64 | | - # tesseract-ocr-mal \ |
65 | | - tesseract-ocr-mkd \ |
66 | | - tesseract-ocr-mya \ |
67 | | - tesseract-ocr-mlt \ |
68 | | - tesseract-ocr-msa \ |
69 | | - tesseract-ocr-est \ |
70 | | - # tesseract-ocr-eus \ |
71 | | - tesseract-ocr-fin \ |
72 | | - tesseract-ocr-fra \ |
73 | | - tesseract-ocr-frk \ |
74 | | - # tesseract-ocr-frm \ |
75 | | - # tesseract-ocr-glg \ |
76 | | - # tesseract-ocr-grc \ |
77 | | - tesseract-ocr-heb \ |
78 | | - tesseract-ocr-hin \ |
79 | | - tesseract-ocr-hrv \ |
80 | | - tesseract-ocr-hye \ |
81 | | - tesseract-ocr-hun \ |
82 | | - # tesseract-ocr-ben \ |
83 | | - tesseract-ocr-bul \ |
84 | | - tesseract-ocr-cat \ |
85 | | - tesseract-ocr-ces \ |
86 | | - tesseract-ocr-nep \ |
87 | | - # tesseract-ocr-chi_sim \ |
88 | | - # tesseract-ocr-chi_tra \ |
89 | | - # tesseract-ocr-chr \ |
90 | | - tesseract-ocr-dan \ |
91 | | - tesseract-ocr-deu \ |
92 | | - tesseract-ocr-ell \ |
93 | | - # tesseract-ocr-enm \ |
94 | | - # tesseract-ocr-epo \ |
95 | | - # tesseract-ocr-equ \ |
96 | | - tesseract-ocr-afr \ |
97 | | - tesseract-ocr-ara \ |
98 | | - tesseract-ocr-aze \ |
99 | | - tesseract-ocr-bel \ |
100 | | - tesseract-ocr-uzb \ |
101 | | - ### pdf convert: libreoffice + a bunch of fonts |
102 | | - libreoffice fonts-opensymbol hyphen-fr hyphen-de \ |
103 | | - hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \ |
104 | | - fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \ |
105 | | - fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \ |
106 | | - fonts-tlwg-purisa \ |
107 | | - ### |
108 | | - && apt-get -qq -y autoremove \ |
109 | | - && apt-get clean \ |
110 | | - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ |
111 | | - && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 |
| 9 | +RUN echo "deb http://http.us.debian.org/debian bookworm non-free" >/etc/apt/sources.list.d/nonfree.list \ |
| 10 | + && apt-get -qq -y update \ |
| 11 | + && apt-get -qq -y install build-essential locales \ |
| 12 | + # python deps (mostly to install their dependencies) |
| 13 | + python3-dev \ |
| 14 | + # tesseract |
| 15 | + tesseract-ocr libtesseract-dev libleptonica-dev \ |
| 16 | + # libraries |
| 17 | + libldap2-dev libsasl2-dev \ |
| 18 | + # package tools |
| 19 | + unrar p7zip-full \ |
| 20 | + # audio & video metadata |
| 21 | + libmediainfo-dev \ |
| 22 | + # image processing, djvu |
| 23 | + mdbtools djvulibre-bin \ |
| 24 | + libtiff5-dev \ |
| 25 | + libtiff-tools ghostscript librsvg2-bin jbig2dec \ |
| 26 | + pst-utils libgif-dev \ |
| 27 | + ### tesseract |
| 28 | + tesseract-ocr-eng \ |
| 29 | + tesseract-ocr-swa \ |
| 30 | + tesseract-ocr-swe \ |
| 31 | + # tesseract-ocr-tam \ |
| 32 | + # tesseract-ocr-tel \ |
| 33 | + tesseract-ocr-fil \ |
| 34 | + # tesseract-ocr-tha \ |
| 35 | + tesseract-ocr-tur \ |
| 36 | + tesseract-ocr-ukr \ |
| 37 | + # tesseract-ocr-vie \ |
| 38 | + tesseract-ocr-nld \ |
| 39 | + tesseract-ocr-nor \ |
| 40 | + tesseract-ocr-pol \ |
| 41 | + tesseract-ocr-por \ |
| 42 | + tesseract-ocr-ron \ |
| 43 | + tesseract-ocr-rus \ |
| 44 | + tesseract-ocr-slk \ |
| 45 | + tesseract-ocr-slv \ |
| 46 | + tesseract-ocr-spa \ |
| 47 | + # tesseract-ocr-spa_old \ |
| 48 | + tesseract-ocr-sqi \ |
| 49 | + tesseract-ocr-srp \ |
| 50 | + tesseract-ocr-ind \ |
| 51 | + tesseract-ocr-isl \ |
| 52 | + tesseract-ocr-ita \ |
| 53 | + # tesseract-ocr-ita_old \ |
| 54 | + # tesseract-ocr-jpn \ |
| 55 | + tesseract-ocr-kan \ |
| 56 | + tesseract-ocr-kat \ |
| 57 | + # tesseract-ocr-kor \ |
| 58 | + tesseract-ocr-khm \ |
| 59 | + tesseract-ocr-lav \ |
| 60 | + tesseract-ocr-lit \ |
| 61 | + # tesseract-ocr-mal \ |
| 62 | + tesseract-ocr-mkd \ |
| 63 | + tesseract-ocr-mya \ |
| 64 | + tesseract-ocr-mlt \ |
| 65 | + tesseract-ocr-msa \ |
| 66 | + tesseract-ocr-est \ |
| 67 | + # tesseract-ocr-eus \ |
| 68 | + tesseract-ocr-fin \ |
| 69 | + tesseract-ocr-fra \ |
| 70 | + tesseract-ocr-frk \ |
| 71 | + # tesseract-ocr-frm \ |
| 72 | + # tesseract-ocr-glg \ |
| 73 | + # tesseract-ocr-grc \ |
| 74 | + tesseract-ocr-heb \ |
| 75 | + tesseract-ocr-hin \ |
| 76 | + tesseract-ocr-hrv \ |
| 77 | + tesseract-ocr-hye \ |
| 78 | + tesseract-ocr-hun \ |
| 79 | + # tesseract-ocr-ben \ |
| 80 | + tesseract-ocr-bul \ |
| 81 | + tesseract-ocr-cat \ |
| 82 | + tesseract-ocr-ces \ |
| 83 | + tesseract-ocr-nep \ |
| 84 | + # tesseract-ocr-chi_sim \ |
| 85 | + # tesseract-ocr-chi_tra \ |
| 86 | + # tesseract-ocr-chr \ |
| 87 | + tesseract-ocr-dan \ |
| 88 | + tesseract-ocr-deu \ |
| 89 | + tesseract-ocr-ell \ |
| 90 | + # tesseract-ocr-enm \ |
| 91 | + # tesseract-ocr-epo \ |
| 92 | + # tesseract-ocr-equ \ |
| 93 | + tesseract-ocr-afr \ |
| 94 | + tesseract-ocr-ara \ |
| 95 | + tesseract-ocr-aze \ |
| 96 | + tesseract-ocr-bel \ |
| 97 | + tesseract-ocr-uzb \ |
| 98 | + ### pdf convert: libreoffice + a bunch of fonts |
| 99 | + libreoffice fonts-opensymbol hyphen-fr hyphen-de \ |
| 100 | + hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-extra \ |
| 101 | + fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \ |
| 102 | + fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \ |
| 103 | + fonts-tlwg-purisa \ |
| 104 | + ### |
| 105 | + && apt-get -qq -y autoremove \ |
| 106 | + && apt-get clean \ |
| 107 | + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ |
| 108 | + && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 |
112 | 109 |
|
113 | 110 | # Set up the locale and make sure the system uses unicode for the file system. |
114 | 111 | ENV LANG='en_US.UTF-8' \ |
115 | | - TZ='UTC' \ |
116 | | - OMP_THREAD_LIMIT='1' \ |
117 | | - OPENBLAS_NUM_THREADS='1' |
| 112 | + TZ='UTC' \ |
| 113 | + OMP_THREAD_LIMIT='1' \ |
| 114 | + OPENBLAS_NUM_THREADS='1' |
118 | 115 |
|
119 | 116 | RUN groupadd -g 1000 -r app \ |
120 | | - && useradd -m -u 1000 -s /bin/false -g app app |
| 117 | + && useradd -m -u 1000 -s /bin/false -g app app |
121 | 118 |
|
122 | 119 | # Download the ftm-typepredict model |
123 | 120 | RUN mkdir /models/ && \ |
124 | | - curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz" |
| 121 | + curl --fail -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz" |
125 | 122 |
|
126 | 123 | COPY requirements.txt /tmp/ |
127 | | -RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip |
128 | | -RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel |
129 | 124 | RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt |
130 | 125 |
|
131 | 126 | # Install spaCy models |
132 | 127 | RUN python3 -m spacy download en_core_web_sm \ |
133 | | - && python3 -m spacy download de_core_news_sm \ |
134 | | - && python3 -m spacy download fr_core_news_sm \ |
135 | | - && python3 -m spacy download es_core_news_sm |
| 128 | + && python3 -m spacy download de_core_news_sm \ |
| 129 | + && python3 -m spacy download fr_core_news_sm \ |
| 130 | + && python3 -m spacy download es_core_news_sm |
136 | 131 | RUN python3 -m spacy download ru_core_news_sm \ |
137 | | - && python3 -m spacy download pt_core_news_sm \ |
138 | | - && python3 -m spacy download ro_core_news_sm \ |
139 | | - && python3 -m spacy download mk_core_news_sm |
| 132 | + && python3 -m spacy download pt_core_news_sm \ |
| 133 | + && python3 -m spacy download ro_core_news_sm \ |
| 134 | + && python3 -m spacy download mk_core_news_sm |
140 | 135 | RUN python3 -m spacy download el_core_news_sm \ |
141 | | - && python3 -m spacy download pl_core_news_sm \ |
142 | | - && python3 -m spacy download it_core_news_sm \ |
143 | | - && python3 -m spacy download lt_core_news_sm \ |
144 | | - && python3 -m spacy download nl_core_news_sm \ |
145 | | - && python3 -m spacy download nb_core_news_sm \ |
146 | | - && python3 -m spacy download da_core_news_sm |
| 136 | + && python3 -m spacy download pl_core_news_sm \ |
| 137 | + && python3 -m spacy download it_core_news_sm \ |
| 138 | + && python3 -m spacy download lt_core_news_sm \ |
| 139 | + && python3 -m spacy download nl_core_news_sm \ |
| 140 | + && python3 -m spacy download nb_core_news_sm \ |
| 141 | + && python3 -m spacy download da_core_news_sm |
147 | 142 | # RUN python3 -m spacy download zh_core_web_sm |
148 | 143 |
|
149 | 144 | COPY . /ingestors |
150 | 145 | WORKDIR /ingestors |
151 | | -RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors |
| 146 | +RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors |
152 | 147 | RUN chown -R app:app /ingestors |
153 | 148 |
|
154 | 149 | ENV ARCHIVE_TYPE=file \ |
155 | | - ARCHIVE_PATH=/data \ |
156 | | - FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ |
157 | | - REDIS_URL=redis://redis:6379/0 \ |
158 | | - TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata \ |
159 | | - LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 |
| 150 | + ARCHIVE_PATH=/data \ |
| 151 | + FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ |
| 152 | + REDIS_URL=redis://redis:6379/0 \ |
| 153 | + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \ |
| 154 | + LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 |
160 | 155 |
|
161 | 156 | # USER app |
162 | 157 | CMD ingestors process |
0 commit comments