-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSplitSentences.sh
More file actions
34 lines (30 loc) · 912 Bytes
/
SplitSentences.sh
File metadata and controls
34 lines (30 loc) · 912 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env bash
#
# Launches the de.uhh.lt.lefex.SentenceSplitter.HadoopMain job on a corpus.
#
# Usage: SplitSentences.sh <corpus-directory> <output-directory>
#
# Environment:
#   bin_hadoop  Expanded unquoted and executed as a command; its output is
#               treated as a space-separated list of jar files.
#               NOTE(review): this variable is not set anywhere in this
#               script -- presumably the caller exports it; confirm.
#
# Exit status: 1 when an argument is missing; otherwise whatever the
# final `hadoop` invocation returns.

if [[ -z "$1" || -z "$2" ]]; then
  # Usage goes to stderr and the exit status is non-zero so that calling
  # scripts can tell a misuse apart from a successful run.
  echo "Split input corpus into one sentence per line, make the sentences unique, remove html." >&2
  echo "parameters: <corpus-directory> <output-directory>" >&2
  exit 1
fi

# Process input params
corpus=$1
output=$2
make_uniq=true
compress_output=false

# Run the jar-listing command once and derive both separators from it.
# $bin_hadoop is deliberately left unquoted: it is run as a command line, so
# it must undergo word splitting (and globbing) exactly as before.
# shellcheck disable=SC2086
jar_list=$($bin_hadoop)
jars=${jar_list// /,}   # comma-separated, handed to -libjars
path=${jar_list// /:}   # colon-separated, handed to HADOOP_CLASSPATH

# The -D options must precede the positional arguments. The map container
# size key is mapreduce.map.memory.mb; it was previously misspelled with a
# trailing "mp", which left the 4096 MB request without effect while the map
# JVM heap was still raised to 4g.
HADOOP_CLASSPATH=$path hadoop \
  de.uhh.lt.lefex.SentenceSplitter.HadoopMain \
  -libjars "$jars" \
  -Dmapreduce.reduce.failures.maxpercent=10 \
  -Dmapreduce.map.failures.maxpercent=10 \
  -Dmapreduce.job.queuename=default \
  -Dmapreduce.map.java.opts=-Xmx4g \
  -Dmapreduce.map.memory.mb=4096 \
  -Dmapreduce.reduce.java.opts=-Xmx8g \
  -Dmapreduce.reduce.memory.mb=8192 \
  -Dtokenize=true \
  -Dmax_sentence_size=110 \
  -Dstrip_html=true \
  "$corpus" \
  "$output" \
  "$make_uniq" \
  "$compress_output"