Skip to content

Commit 08951f5

Browse files
authored
Setup initial tooling and templates (#1)
1 parent 0b0da94 commit 08951f5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1238
-2
lines changed

.circleci/config.template.yml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{{ config_header }}
2+
version: 2.1
3+
4+
orbs:
5+
gcp-gcr: circleci/[email protected]
6+
7+
commands:
8+
compare-branch:
9+
description: Compare current branch with main
10+
parameters:
11+
pattern:
12+
type: string
13+
steps:
14+
- run:
15+
name: Compare current branch with main
16+
command: |
17+
if [ "$CIRCLE_BRANCH" = main ]; then
18+
echo "Run tests because branch is main"
19+
elif git log --format=%B --no-merges -n 1 | grep -qF '[run-tests]'; then
20+
echo "Run tests because [run-tests] in commit message"
21+
elif git diff --name-only ..origin | egrep -q '<< parameters.pattern >>'; then
22+
echo "Run tests because << parameters.pattern >> was modified since branching off main"
23+
else
24+
echo "Skipping tests because << parameters.pattern >> was not modified"
25+
circleci step halt
26+
fi
27+
28+
jobs:
29+
build-docker-etl:
30+
docker:
31+
- image: docker:stable-git
32+
steps:
33+
- checkout
34+
- setup_remote_docker:
35+
version: 19.03.13
36+
- run:
37+
name: Build Docker image
38+
command: docker build -t docker-etl:build .
39+
- run:
40+
name: Test Code
41+
command: docker run docker-etl:build pytest --black --flake8 docker_etl/ tests/
42+
- run:
43+
name: Verify jobs have required files
44+
command: docker run docker-etl:build script/verify_files
45+
- run:
46+
name: Verify CI config is up-to-date
47+
command: docker run docker-etl:build python3 -m docker_etl.ci_config --dry-run | diff -B .circleci/config.yml -
48+
49+
{{ jobs | indent(2, True) }}
50+
51+
workflows:
52+
docker-etl:
53+
jobs:
54+
- build-docker-etl
55+
56+
{{ workflows | indent(2, True) }}
57+

.circleci/config.yml

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
###
2+
# This config.yml was generated by docker-etl/ci_config.py.
3+
# Changes should be made to templates/config.template.yml and re-generated.
4+
###
5+
version: 2.1
6+
7+
orbs:
8+
gcp-gcr: circleci/[email protected]
9+
10+
commands:
11+
compare-branch:
12+
description: Compare current branch with main
13+
parameters:
14+
pattern:
15+
type: string
16+
steps:
17+
- run:
18+
name: Compare current branch with main
19+
command: |
20+
if [ "$CIRCLE_BRANCH" = main ]; then
21+
echo "Run tests because branch is main"
22+
elif git log --format=%B --no-merges -n 1 | grep -qF '[run-tests]'; then
23+
echo "Run tests because [run-tests] in commit message"
24+
elif git diff --name-only ..origin | egrep -q '<< parameters.pattern >>'; then
25+
echo "Run tests because << parameters.pattern >> was modified since branching off main"
26+
else
27+
echo "Skipping tests because << parameters.pattern >> was not modified"
28+
circleci step halt
29+
fi
30+
31+
jobs:
32+
build-docker-etl:
33+
docker:
34+
- image: docker:stable-git
35+
steps:
36+
- checkout
37+
- setup_remote_docker:
38+
version: 19.03.13
39+
- run:
40+
name: Build Docker image
41+
command: docker build -t docker-etl:build .
42+
- run:
43+
name: Test Code
44+
command: docker run docker-etl:build pytest --black --flake8 docker_etl/ tests/
45+
- run:
46+
name: Verify jobs have required files
47+
command: docker run docker-etl:build script/verify_files
48+
- run:
49+
name: Verify CI config is up-to-date
50+
command: docker run docker-etl:build python3 -m docker_etl.ci_config --dry-run | diff -B .circleci/config.yml -
51+
52+
build-job-example_job:
53+
docker:
54+
- image: docker:stable-git
55+
steps:
56+
- checkout
57+
- compare-branch:
58+
pattern: ^jobs/example_job/
59+
- setup_remote_docker:
60+
version: 19.03.13
61+
- run:
62+
name: Build Docker image
63+
command: docker build -t app:build jobs/example_job/
64+
65+
66+
workflows:
67+
docker-etl:
68+
jobs:
69+
- build-docker-etl
70+
71+
job-example_job:
72+
jobs:
73+
- build-job-example_job
74+
- gcp-gcr/build-and-push-image:
75+
context: data-eng-airflow-gcr
76+
path: jobs/example_job/
77+
image: example_job_docker_etl
78+
requires:
79+
- build-job-example_job
80+
filters:
81+
branches:
82+
only: main
83+

.dockerignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.DS_Store
2+
*.pyc
3+
__pycache__/
4+
.pytest_cache/
5+
venv/

.flake8

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[flake8]
2+
max-line-length = 88

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.DS_Store
2+
venv/

Dockerfile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
FROM python:3.8
2+
MAINTAINER REPLACE ME <[email protected]>
3+
4+
# https://github.com/mozilla-services/Dockerflow/blob/master/docs/building-container.md
5+
ARG USER_ID="10001"
6+
ARG GROUP_ID="app"
7+
ARG HOME="/app"
8+
9+
ENV HOME=${HOME}
10+
RUN groupadd --gid ${USER_ID} ${GROUP_ID} && \
11+
useradd --create-home --uid ${USER_ID} --gid ${GROUP_ID} --home-dir ${HOME} ${GROUP_ID}
12+
13+
WORKDIR ${HOME}
14+
15+
RUN pip install --upgrade pip
16+
17+
COPY requirements.txt .
18+
RUN pip install -r requirements.txt
19+
20+
COPY . .
21+
22+
RUN pip install .
23+
24+
# Drop root and change ownership of the application folder to the user
25+
RUN chown -R ${USER_ID}:${GROUP_ID} ${HOME}
26+
USER ${USER_ID}

README.md

Lines changed: 137 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,137 @@
1-
# data-etl
2-
ETL jobs managed by data engineering
1+
# Docker ETL
2+
3+
This repo is a collection of dockerized ETL jobs to increase discoverability
4+
of the source code of scheduled ETL.
5+
There are also tools here that automate the common steps involved with creating and
6+
scheduling an ETL job.
7+
This includes defining a Docker image, setting up CI, and language boilerplate.
8+
The primary use of this repo is to create Dockerized jobs that are pushed to GCR
9+
so they can be scheduled via the Airflow GKE pod operator.
10+
11+
## Project Structure
12+
13+
### Jobs
14+
15+
Each job is located in its own directory in the `jobs/` directory,
16+
e.g. the contents of a job named `my-job` would go into `jobs/my-job`.
17+
18+
All job directories should have a `Dockerfile`, a `ci_job.yaml`,
19+
a `ci_workflow.yaml`, and a `README.md` in the root directory.
20+
`ci_job.yaml` and `ci_workflow.yaml` contain the yaml structure that will be placed
21+
in the `jobs:` and `workflows:` sections of the CircleCI `config.yml`, respectively.
22+
23+
### Templates
24+
25+
Templates for job creation are located in `templates/`.
26+
27+
The CI config template is in `.circleci/config.template.yml`.
28+
This is the file that should be modified instead of `.circleci/config.yml`.
29+
30+
Each job template is located in a directory in `templates/` that is the name of the template,
31+
e.g. a `python` template is in `templates/python/`.
32+
Within the directory of a template is a directory named `job/` that contains
33+
all the contents that will be copied when the template is used.
34+
Other files in the directory of a particular template are used for
35+
job creation, e.g. `ci_job.template.yaml`.
36+
37+
### Example Directory Structure:
38+
39+
```
40+
+--docker-etl/
41+
| +--jobs/
42+
| +--example-python-1/
43+
| +--ci_job.yaml
44+
| +--ci_workflow.yaml
45+
| +--Dockerfile
46+
| +--README.md
47+
| +--script
48+
| +--templates/
49+
| +--python/
50+
| +--job/
51+
| +--module/
52+
| +--tests/
53+
| +--Dockerfile
54+
| +--README.md
55+
| +--requirements.txt
56+
| +--ci_job.template.yaml
57+
| +--ci_workflow.template.yaml
58+
59+
```
60+
61+
## Development
62+
63+
The tools in this repository are intended for Python 3.8+.
64+
65+
To install dependencies:
66+
67+
```sh
68+
pip install -r requirements.txt
69+
```
70+
71+
This project uses `pip-tools` to pin dependencies. New dependencies go in
72+
`requirements.in` and `pip-compile` is used to generate `requirements.txt`:
73+
74+
```sh
75+
pip install pip-tools
76+
pip-compile --generate-hashes requirements.in
77+
```
78+
79+
To run tests:
80+
81+
```sh
82+
pytest --flake8 --black tests/
83+
```
84+
85+
### Adding a new job
86+
87+
To add a new job:
88+
89+
```sh
90+
./script/create_job --job-name example-job --template python
91+
```
92+
93+
`job-name` is the name of the directory that will be created in `jobs/`.
94+
95+
`template` is an optional argument that will populate the created directory
96+
with the contents of a template.
97+
If no template is given, a directory with only the required files is created.
98+
99+
#### Available Templates:
100+
101+
| Template name | Description |
102+
| ------------- | ----------- |
103+
| default | Base directory with readme, Dockerfile, and CI config files |
104+
| python | Simple Python module with unit test and lint config |
105+
106+
### Modifying the CI config
107+
108+
This repo uses CircleCI which only allows a single global config file.
109+
In order to simplify adding and removing jobs to CI, the config file is
110+
generated using templates.
111+
This means the `config.yml` in `.circleci/` should not be modified directly.
112+
113+
Generate `.circleci/config.yml`:
114+
115+
```sh
116+
./script/update_ci_config
117+
```
118+
119+
To make changes to the config that are not ETL job specific
120+
(e.g. add a command), changes should be made to `.circleci/config.template.yml`
121+
and the output config should be re-generated.
122+
123+
Each job has a `ci_job.yaml` and a `ci_workflow.yaml` which define the steps
124+
that will go into the jobs and workflow sections of the CircleCI config.
125+
Any changes to these files should be followed by updating the global config
126+
via `script/update_ci_config`.
127+
When a job is created, the CI files are created based on the
128+
`ci_*.template.yaml` files in the template directory.
129+
130+
### Adding a template
131+
132+
To add a new template, create a new directory in `templates/` with the name
133+
of the template.
134+
This directory must have a `ci_job.template.yaml`, a `ci_workflow.template.yaml`,
135+
and a `job/` directory which contains all the files that will be copied to
136+
any job that uses this template.
137+

docker_etl/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)