Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .formatter.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Used by "mix format"
# `inputs` lists the glob patterns of the files to format: the root-level
# `mix.exs` / `.formatter.exs`, plus every `.ex` / `.exs` file under
# `config/`, `lib/` and `test/`.
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
]
26 changes: 26 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# The directory Mix will write compiled artifacts to.
/_build/

# If you run "mix test --cover", coverage assets end up here.
/cover/

# The directory Mix downloads your dependencies sources to.
/deps/

# Where third-party dependencies like ExDoc output generated docs.
/doc/

# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump

# Also ignore archive artifacts (built via "mix archive.build").
*.ez

# Ignore package tarball (built via "mix hex.build").
fekex-*.tar

# Temporary files, for example, from tests.
/tmp/

.tool-versions
data/
175 changes: 175 additions & 0 deletions lib/fek_fetcher.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
defmodule FekFetcher do
  @moduledoc """
  Searches the FEK search API and downloads every matching document.

  `search_feks/2` queries the `simplesearch` endpoint and turns each hit into a
  `FekFetcher.FekSearchResult`. `process_fek_result/2` then stores the PDF and
  four JSON metadata files of one hit under `<dst>/<issue year>/<primary label>/`.
  """

  require Logger

  defmodule FekSearchResult do
    @moduledoc """
    One hit of the `simplesearch` endpoint, reduced to the fields the fetcher uses.
    """

    @enforce_keys [
      :document_number,
      :id,
      :issue_date,
      :issue_group_id,
      :primary_label,
      :publication_date
    ]
    defstruct @enforce_keys

    @type t :: %__MODULE__{
            document_number: String.t(),
            id: String.t(),
            issue_date: String.t(),
            issue_group_id: String.t(),
            primary_label: String.t(),
            publication_date: String.t()
          }

    @doc """
    Builds a result struct from one raw search item.

    All of the `search_DocumentNumber`, `search_ID`, `search_IssueDate`,
    `search_IssueGroupID`, `search_PrimaryLabel` and `search_PublicationDate`
    keys must be present; otherwise `{:error, message}` is returned.
    """
    @spec from_search_result(map()) :: {:ok, t()} | {:error, String.t()}
    def from_search_result(data) do
      with {:ok, document_number} <- Map.fetch(data, "search_DocumentNumber"),
           {:ok, id} <- Map.fetch(data, "search_ID"),
           {:ok, issue_date} <- Map.fetch(data, "search_IssueDate"),
           {:ok, issue_group_id} <- Map.fetch(data, "search_IssueGroupID"),
           {:ok, primary_label} <- Map.fetch(data, "search_PrimaryLabel"),
           {:ok, publication_date} <- Map.fetch(data, "search_PublicationDate") do
        {:ok,
         %__MODULE__{
           document_number: document_number,
           id: id,
           issue_date: issue_date,
           issue_group_id: issue_group_id,
           primary_label: primary_label,
           publication_date: publication_date
         }}
      else
        :error -> {:error, "Missing required fields in search result"}
      end
    end

    @doc """
    Returns the year component of the issue date as a string.

    The issue date is expected to start with a slash-separated date whose third
    component is the year (presumably `MM/DD/YYYY` — confirm against the API),
    optionally followed by space-separated time tokens, which are ignored.
    Raises `MatchError` when the date part does not have exactly three components.
    """
    @spec issue_year(t()) :: String.t()
    def issue_year(%__MODULE__{issue_date: issue_date}) do
      # Only the first token is the date; anything after it ("00:00:00", "AM", ...)
      # is irrelevant here, so do not insist on exactly one trailing token.
      [date | _time_tokens] = String.split(issue_date, " ")
      [_month, _day, year] = String.split(date, "/")

      year
    end
  end

  # Both base URLs end with "/", so everything appended to them must be relative
  # (no leading slash) to avoid "//" in the final URL.
  @base_url "https://searchetv99.azurewebsites.net/api/"
  @pdf_base_url "https://ia37rg02wpsa01.blob.core.windows.net/fek/"

  @doc """
  Runs a search for the given years and issue groups.

  Returns `{:ok, results}` with every item that could be converted into a
  `FekSearchResult`; items with missing fields are dropped. Any transport,
  status or decoding problem is returned as `{:error, message}`.
  """
  @spec search_feks([String.t()], [String.t()]) ::
          {:ok, [FekSearchResult.t()]} | {:error, String.t()}
  def search_feks(years, issues) do
    payload = %{
      "selectYear" => years,
      "selectIssue" => issues
    }

    # The endpoint wraps its payload: the "data" field of the response is itself a
    # JSON document encoded as a string, hence the second decode step.
    with {:ok, %{"data" => data}} when is_binary(data) <-
           post(@base_url <> "simplesearch", payload),
         {:ok, items} when is_list(items) <- JSON.decode(data) do
      feks =
        for item <- items,
            is_map(item),
            {:ok, fek} <- [FekSearchResult.from_search_result(item)],
            do: fek

      {:ok, feks}
    else
      {:error, reason} ->
        {:error, "Failed to fetch fek IDs: #{format_reason(reason)}"}

      _unexpected ->
        {:error, "Failed to fetch fek IDs: unexpected response format"}
    end
  end

  @doc """
  Downloads the PDF and the JSON metadata files of `fek` below `dst`.

  The target directory `<dst>/<issue year>/<primary label>` (with `/` in the
  label replaced by `_`) is created if needed. All downloads run concurrently.
  A failed download is logged as a warning and does not abort the others, so the
  function returns `{:ok, directory}` even when some files could not be fetched.
  """
  @spec process_fek_result(FekSearchResult.t(), Path.t()) :: {:ok, Path.t()}
  def process_fek_result(%FekSearchResult{} = fek, dst) do
    fek_path = base_fek_path(fek, dst)
    File.mkdir_p!(fek_path)

    pdf_task = Task.async(fn -> get_pdf(fek, fek_path) end)

    [pdf_task | spawn_tasks_to_get_json_metadata(fek, fek_path)]
    |> Task.await_many(:infinity)
    |> Enum.each(fn
      {:error, reason} -> Logger.warning("#{fek.primary_label}: #{reason}")
      _ok -> :ok
    end)

    {:ok, fek_path}
  end

  # Issues a GET request and normalises the outcome (see handle_response/1).
  defp get(url), do: url |> Req.get() |> handle_response()

  # Issues a POST request with `data` as JSON body and normalises the outcome.
  defp post(url, data), do: url |> Req.post(json: data) |> handle_response()

  # Maps a Req result to `{:ok, body}` for HTTP 200 and `{:error, message}` otherwise.
  defp handle_response({:ok, %Req.Response{status: 200, body: body}}), do: {:ok, body}

  defp handle_response({:ok, %Req.Response{status: status}}),
    do: {:error, "Request failed with status #{status}"}

  defp handle_response({:error, reason}),
    do: {:error, "Request failed: #{format_reason(reason)}"}

  # Error reasons are exception structs (Req) or tuples (JSON); neither can be
  # interpolated into a string directly, so render them explicitly.
  defp format_reason(reason) when is_binary(reason), do: reason
  defp format_reason(reason) when is_exception(reason), do: Exception.message(reason)
  defp format_reason(reason), do: inspect(reason)

  # Builds the PDF URL: <base><group>/<year>/<year><group><number>.pdf, with the
  # group zero-padded to 2 and the document number zero-padded to 5 characters.
  defp pdf_url(%FekSearchResult{} = fek) do
    document_number = String.pad_leading(fek.document_number, 5, "0")
    issue_group_id = String.pad_leading(fek.issue_group_id, 2, "0")
    issue_year = FekSearchResult.issue_year(fek)

    filename = "#{issue_year}#{issue_group_id}#{document_number}"

    "#{@pdf_base_url}#{issue_group_id}/#{issue_year}/#{filename}.pdf"
  end

  # Directory for one document: <base_path>/<issue year>/<primary label>.
  defp base_fek_path(%FekSearchResult{} = fek, base_path) do
    year = FekSearchResult.issue_year(fek)
    # The label may contain "/", which would otherwise create nested directories.
    primary_label = String.replace(fek.primary_label, "/", "_")

    Path.join([base_path, year, primary_label])
  end

  # {target file name, API path relative to @base_url} for each metadata document.
  defp metadata_endpoints(%FekSearchResult{id: id}) do
    [
      {"metadata.json", "documententitybyid/#{id}"},
      {"timeline.json", "timeline/#{id}/0"},
      {"named_entity.json", "namedentity/#{id}"},
      {"tags.json", "tagsbydocumententity/#{id}"}
    ]
  end

  # Starts one task per metadata endpoint; each task resolves to
  # `{:ok, file_path}` or `{:error, message}`.
  defp spawn_tasks_to_get_json_metadata(%FekSearchResult{} = fek, dst) do
    for {filename, url_path} <- metadata_endpoints(fek) do
      Task.async(fn -> download_metadata(@base_url <> url_path, Path.join(dst, filename)) end)
    end
  end

  # Fetches `url` and writes the string found under the "data" key to `file_path`.
  defp download_metadata(url, file_path) do
    case get(url) do
      {:ok, %{"data" => data}} when is_binary(data) ->
        File.write!(file_path, data)
        {:ok, file_path}

      {:ok, _unexpected} ->
        {:error, "Unexpected response body from #{url}"}

      {:error, reason} ->
        {:error, "Failed to download #{url}: #{reason}"}
    end
  end

  # Downloads the PDF of `fek` to `<dst>/<document number>.pdf`.
  defp get_pdf(%FekSearchResult{} = fek, dst) do
    case get(pdf_url(fek)) do
      {:ok, pdf_data} when is_binary(pdf_data) ->
        file_path = Path.join(dst, "#{fek.document_number}.pdf")
        File.write!(file_path, pdf_data)
        {:ok, file_path}

      {:ok, _unexpected} ->
        {:error, "Failed to download PDF: unexpected response body"}

      {:error, reason} ->
        {:error, "Failed to download PDF: #{reason}"}
    end
  end
end
18 changes: 18 additions & 0 deletions lib/fekex.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
defmodule Fekex do
  @moduledoc """
  Top-level module of the `fekex` application.

  It currently exposes only `hello/0`.
  """

  @doc """
  Returns the atom `:world`.

  ## Examples

      iex> Fekex.hello()
      :world

  """
  @spec hello() :: :world
  def hello, do: :world
end
28 changes: 28 additions & 0 deletions mix.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
defmodule Fekex.MixProject do
  use Mix.Project

  @version "0.1.0"
  @elixir_requirement "~> 1.18"

  # Project definition read by Mix: application name, version, required Elixir
  # version and dependencies.
  def project do
    # Only production runs the application in permanent mode.
    production? = Mix.env() == :prod

    [
      app: :fekex,
      version: @version,
      elixir: @elixir_requirement,
      start_permanent: production?,
      deps: deps()
    ]
  end

  # OTP application settings; see "mix help compile.app".
  def application, do: [extra_applications: [:logger]]

  # Hex dependencies; see "mix help deps".
  defp deps do
    [
      {:req, "~> 0.5.0"},
      {:progress_bar, "~> 3.0"}
    ]
  end
end
23 changes: 23 additions & 0 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
%{
"certifi": {:hex, :certifi, "2.15.0", "0e6e882fcdaaa0a5a9f2b3db55b1394dba07e8d6d9bcad08318fb604c6839712", [:rebar3], [], "hexpm", "b147ed22ce71d72eafdad94f055165c1c182f61a2ff49df28bcc71d1d5b94a60"},
"decimal": {:hex, :decimal, "2.3.0", "3ad6255aa77b4a3c4f818171b12d237500e63525c2fd056699967a3e7ea20f62", [:mix], [], "hexpm", "a4d66355cb29cb47c3cf30e71329e58361cfcb37c34235ef3bf1d7bf3773aeac"},
"file_system": {:hex, :file_system, "1.1.0", "08d232062284546c6c34426997dd7ef6ec9f8bbd090eb91780283c9016840e8f", [:mix], [], "hexpm", "bfcf81244f416871f2a2e15c1b515287faa5db9c6bcf290222206d120b3d43f6"},
"finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"},
"hackney": {:hex, :hackney, "1.24.1", "f5205a125bba6ed4587f9db3cc7c729d11316fa8f215d3e57ed1c067a9703fa9", [:rebar3], [{:certifi, "~> 2.15.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.4", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "f4a7392a0b53d8bbc3eb855bdcc919cd677358e65b2afd3840b5b3690c4c8a39"},
"hpax": {:hex, :hpax, "1.0.3", "ed67ef51ad4df91e75cc6a1494f851850c0bd98ebc0be6e81b026e765ee535aa", [:mix], [], "hexpm", "8eab6e1cfa8d5918c2ce4ba43588e894af35dbd8e91e6e55c817bca5847df34a"},
"httpoison": {:hex, :httpoison, "2.2.3", "a599d4b34004cc60678999445da53b5e653630651d4da3d14675fedc9dd34bd6", [:mix], [{:hackney, "~> 1.21", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "fa0f2e3646d3762fdc73edb532104c8619c7636a6997d20af4003da6cfc53e53"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"},
"mimerl": {:hex, :mimerl, "1.4.0", "3882a5ca67fbbe7117ba8947f27643557adec38fa2307490c4c4207624cb213b", [:rebar3], [], "hexpm", "13af15f9f68c65884ecca3a3891d50a7b57d82152792f3e19d88650aa126b144"},
"mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"},
"nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
"progress_bar": {:hex, :progress_bar, "3.0.0", "f54ff038c2ac540cfbb4c2bfe97c75e7116ead044f3c2b10c9f212452194b5cd", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6981c2b25ab24aecc91a2dc46623658e1399c21a2ae24db986b90d678530f2b7"},
"req": {:hex, :req, "0.5.10", "a3a063eab8b7510785a467f03d30a8d95f66f5c3d9495be3474b61459c54376c", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "8a604815743f8a2d3b5de0659fa3137fa4b1cffd636ecb69b30b2b9b2c2559be"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
"telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.7.1", "a48703a25c170eedadca83b11e88985af08d35f37c6f664d6dcfb106a97782fc", [:rebar3], [], "hexpm", "b3a917854ce3ae233619744ad1e0102e05673136776fb2fa76234f3e03b23642"},
}
13 changes: 13 additions & 0 deletions scripts/fetch.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Downloads every document found for the years given on the command line into
# the "data" directory, showing a progress bar while doing so.
# Presumably run inside the project, e.g. `mix run scripts/fetch.exs 2024` —
# the script relies on the project's FekFetcher module and the progress_bar dep.
dst = "data"
years = System.argv()

# NOTE(review): the issue-group filter is hard-coded to "2"; confirm what that
# value selects on the API side.
results =
  case FekFetcher.search_feks(years, ["2"]) do
    {:ok, results} ->
      results

    {:error, reason} ->
      # Report the failure readably instead of dying with a bare MatchError.
      IO.puts(:stderr, reason)
      System.halt(1)
  end

total = Enum.count(results)
File.mkdir_p!(dst)

results
|> Task.async_stream(fn fek -> FekFetcher.process_fek_result(fek, dst) end,
  timeout: :infinity,
  max_concurrency: 5
)
# Count finished items from 1 so the last render is `total/total`; a 0-based
# index would leave the bar one step short of completion.
|> Stream.with_index(1)
|> Enum.each(fn {_result, done} ->
  ProgressBar.render(done, total)
end)
8 changes: 8 additions & 0 deletions test/fekex_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
defmodule FekexTest do
  use ExUnit.Case

  # Runs the `iex>` examples embedded in Fekex's documentation as tests.
  doctest Fekex

  test "greets the world" do
    greeting = Fekex.hello()

    assert greeting == :world
  end
end
1 change: 1 addition & 0 deletions test/test_helper.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Starts ExUnit so the test modules under test/ can run.
ExUnit.start()