Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions infra/reverse_proxy/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Local reverse proxy: runs the stock nginx image with reverse.conf installed
# as the default virtual host.
version: '2'
services:
  app:
    image: nginx
    volumes:
      # Replace the image's default site with our proxy configuration.
      # Mounted read-only: the container only needs to read this file.
      - ./reverse.conf:/etc/nginx/conf.d/default.conf:ro
    ports:
      # Host port 8080 -> container port 80 (the port reverse.conf listens on).
      - "8080:80"
50 changes: 50 additions & 0 deletions infra/reverse_proxy/reverse.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Reverse proxy virtual host: listens on port 80 (IPv4 and IPv6) and forwards
# every request to https://portaldjbr.contraloria.gov.py/.
server {
    listen 80;
    listen [::]:80;
    server_name localhost;

    #charset koi8-r;
    #access_log /var/log/nginx/host.access.log main;

    # location / {
    #     root /usr/share/nginx/html;
    #     index index.html index.htm;
    # }

    #error_page 404 /404.html;

    # redirect server error pages to the static page /50x.html
    #
    error_page 500 502 503 504 /50x.html;
    location = /50x.html {
        root /usr/share/nginx/html;
    }

    # Forward everything else to the upstream portal over HTTPS.
    location / {
        proxy_pass https://portaldjbr.contraloria.gov.py/;

        # nginx does not send the upstream host name via SNI by default;
        # enable it so a name-based TLS upstream can select the right
        # certificate / virtual host during the handshake.
        proxy_ssl_server_name on;
    }

    # proxy the PHP scripts to Apache listening on 127.0.0.1:80
    #
    #location ~ \.php$ {
    #    proxy_pass http://127.0.0.1;
    #}

    # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
    #
    #location ~ \.php$ {
    #    root html;
    #    fastcgi_pass 127.0.0.1:9000;
    #    fastcgi_index index.php;
    #    fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name;
    #    include fastcgi_params;
    #}

    # deny access to .htaccess files, if Apache's document root
    # concurs with nginx's one
    #
    #location ~ /\.ht {
    #    deny all;
    #}
}

Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

# DAG-level settings, each read from an Airflow Variable with a fallback.

# Directory handed to the tasks as ``params.target_dir``; falls back to
# <root>/tmp/contralory/raw when CGR_PDF_FOLDER is not set.
dag_job_target_dir = os.path.join(
    Variable.get(
        "CGR_PDF_FOLDER",
        os.path.join(os.sep, "tmp", "contralory", "raw"),
    )
)

# Value passed to each task as ``mod_of``; defaults to 10.
dag_sub_jobs_count = int(
    Variable.get("CGR_DOWNLOAD_PDF_SUB_JOBS_COUNT", 10)
)

# URL passed to each task as ``url``; overridable so the downloads can be
# pointed at a different endpoint without editing the DAG.
dag_base_url = Variable.get(
    "CGR_DOWNLOAD_PDF_BASE_URL",
    "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/descargarpdf/",
)

default_args = {
"owner": "airflow",
Expand All @@ -47,7 +48,6 @@
"email_on_retry": False,
"retry_delay": timedelta(hours=1),
"params": {
"url": "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/descargarpdf/",
"target_dir": dag_job_target_dir
},
}
Expand Down Expand Up @@ -237,7 +237,7 @@ def do_work(number: int, url: str, target_dir: str, mod_of: int):
python_callable=do_work,
op_kwargs={
"number": digit,
"url": "{{ params.url }}",
"url": dag_base_url,
"target_dir": "{{ params.target_dir }}",
"mod_of": dag_sub_jobs_count
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import requests
from re import findall as re_findall
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.models import DAG, Variable

# URL passed to the list-fetching tasks as ``url``; read from the Airflow
# Variable CGR_FETCHER_BASE_URL, with the public portal endpoint as fallback.
dag_base_url = Variable.get(
    "CGR_FETCHER_BASE_URL",
    "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/declaraciones/paginadas",
)

default_args = {
"owner": "airflow",
Expand All @@ -18,7 +20,6 @@
"email_on_retry": False,
"retry_delay": timedelta(hours=1),
"params": {
"url": "https://portaldjbr.contraloria.gov.py/portal-djbr/api/consulta/declaraciones/paginadas",
},
}

Expand Down Expand Up @@ -82,7 +83,6 @@ def list_navigator(base_query: str, url: str):
yield records
should_continue = len(records) != 0
page += 1
# should_continue = False


def get_upsert_query() -> str:
Expand Down Expand Up @@ -145,15 +145,14 @@ def fetch_list(letter: str, url: str, **kwargs):
''')

for letter in ['a', 'e', 'i', 'o', 'u']:
# for letter in ['a']:
# Get list from webpage
get_pdf_list = PythonOperator(
task_id=f"""get_pdf_list_{letter}""",
provide_context=True,
python_callable=fetch_list,
op_kwargs={
"letter": letter,
"url": "{{ params.url }}",
"url": dag_base_url,
},
)

Expand Down