-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstartup.sh
More file actions
57 lines (36 loc) · 2.27 KB
/
startup.sh
File metadata and controls
57 lines (36 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env bash
# Bootstrap script for the project: downloads a Wikidata "truthy" dump,
# converts it into a TSV of (subject, property, object) triples, filters the
# triples, interconnects them with Wikipedia pages, and applies the NLI
# entailment filtering steps to produce the final dataset.
#
# Requires: wget, bzcat, GNU grep/sed/cut, and a python interpreter with the
# project scripts' dependencies installed.
set -euo pipefail

# How many lines of the (very large) dump to keep as the working sample.
readonly SAMPLE_LINES=100000000

################################
# Downloading Wikidata Triples #
################################
echo "-> downloading wikidata dumps..."
mkdir -p dataset            # -p: do not fail if the directory already exists (re-runs)
cd dataset || exit 1        # explicit guard; 'set -e' alone skips some edge cases

# triples
echo "-> download wikidata facts triples statements from wikidata truthy dump .."
wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-truthy.nt.bz2

echo "-> make csv file out of nt .."
## Making the file smaller -> SAMPLE_LINES lines
echo "-> Shortening the triples file..."
# 'head' exits after SAMPLE_LINES lines, so bzcat dies with SIGPIPE (status 141);
# the '|| true' keeps that expected condition from tripping 'set -o pipefail'.
{ bzcat latest-truthy.nt.bz2 || true; } | head -n "${SAMPLE_LINES}" > wikidata-sample.nt

## Skip labels and meta information, keep only direct-property statements;
## strip N-Triples syntax (<>, quotes, @lang tags) and the Wikidata URI
## prefixes, leaving TAB-separated Pxx / Qxx identifiers.
echo "-> Skipping labels and meta information..."
grep -- "/prop/direct/P" wikidata-sample.nt \
  | sed -E 's/[<>"]//g; s/@.+//g' \
  | cut -d' ' -f1-3 \
  | sed -E 's/[[:space:]]/\t/g' \
  | sed -E 's#http://www\.wikidata\.org/(prop/direct|entity)/##g' \
  > wikidata_triples.csv

echo "-> Filtering the triples..."
## Filtering the triples. First keep only Qxxx objects, then if we need to
## select the wikidata relations/properties to work with.
## example use: python filter_triples.py wikidata_triples.csv -c "P17,P50,P106" -s 0 -e 3000 -f "wikidata_triples_final.csv"
python filter_triples.py wikidata_triples.csv -f "wikidata_triples_final.csv"

## Check the number of triples per relation (most frequent first).
cut -f2 wikidata_triples_final.csv | sort | uniq -c | sort -nr > nrelations_teste_final.txt

echo "-> Interconnecting Wikidata Triples with Wikipedia pages..."
## example use: python dataset.py wikidata_triples_final.csv -o "dataset.jsonl" -f "filtered_triples.csv"
## -o -> output dataset | -f -> triples that originated the dataset
# NOTE(review): the python helper scripts are invoked relative to dataset/
# after the 'cd' above — confirm they are actually located (or symlinked) there.
python dataset.py wikidata_triples_final.csv

echo "-> Applying the NLI Step..."
## example use: python NLI.py dataset.jsonl -o "dataset_NLI.jsonl"
## -o -> output dataset after the NLI step
python NLI.py dataset.jsonl

echo "-> Filtering the dataset by the NLI entailment score (Keep higher than 0.95) and cleaning..."
## example use: python NLI_filter.py dataset_NLI.jsonl -o "dataset_NLI_Final.jsonl"
## -o -> output dataset after the entailment filtering and cleaning -> Final dataset File
python NLI_filter.py dataset_NLI.jsonl