diff --git a/examples-python/01SparkRDD/README.md b/examples-python/01SparkRDD/README.md
new file mode 100644
index 0000000..4166288
--- /dev/null
+++ b/examples-python/01SparkRDD/README.md
@@ -0,0 +1,68 @@
+### Test data
+```
+a,123,456,789,11344,2142,123
+b,1234,124,1234,123,123
+c,123,4123,5435,1231,5345
+d,123,456,789,113,2142,143
+e,123,446,789,14,2142,113
+f,123,446,789,14,2142,1113,323
+```
+
+### Map exercise:
+* spark-hw01-map.py
+Take the leading letter of every line in the test data. Expected result:
+```
+a
+b
+c
+d
+e
+f
+```
+
+### FlatMap exercise:
+* spark-hw02-flatmap.py
+Split every line of the test data on "," and list all of the resulting tokens.
+Expected result:
+```
+a
+123
+456
+789
+11344
+2142
+123
+…
+```
+
+### filter exercise:
+* spark-hw03-filter.py
+Keep only the tokens that contain 123 or 456. Expected result:
+```
+123
+456
+123
+1234
+1234
+…
+```
+
+### reduce exercise:
+* spark-hw04-reduce.py
+Take the leading letter of every line and use reduce to concatenate the letters into a single string. Expected result:
+```
+abcdef
+```
+
+### reduceByKey exercise:
+* spark-hw05-reduceByKey.py
+Map every token to a (token, 1) pair and use reduceByKey to count how often each token appears. Expected result:
+```
+(d,1)
+(1113,1)
+(1231,1)
+(e,1)
+(14,2)
+(113,2)
+…
+```
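+
+### map vs. flatMap at a glance
+The first two exercises differ only in how the split lines come back. A quick sketch in the pyspark shell (the two input strings below are made up for illustration; `sc` is the shell's predefined SparkContext):
+```python
+rdd = sc.parallelize(["a,123", "b,456"])             # toy data, not the HDFS test file
+rdd.map(lambda line: line.split(',')).collect()      # [['a', '123'], ['b', '456']] -- one list per line
+rdd.flatMap(lambda line: line.split(',')).collect()  # ['a', '123', 'b', '456'] -- tokens flattened into one RDD
+```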
diff --git a/examples-python/01SparkRDD/spark-hw01-map.py b/examples-python/01SparkRDD/spark-hw01-map.py
new file mode 100644
index 0000000..7ce4899
--- /dev/null
+++ b/examples-python/01SparkRDD/spark-hw01-map.py
@@ -0,0 +1,8 @@
+from pyspark import SparkContext
+
+# Map exercise: take the leading letter of every line in the test data.
+sc = SparkContext("local", "Simple App")
+textFile = sc.textFile("hdfs:/spark/hw/test.txt")
+
+letters = textFile.map(lambda line: line.split(',')) \
+                  .map(lambda fields: fields[0]) \
+                  .collect()
+for letter in letters:
+    print(letter)
diff --git a/examples-python/01SparkRDD/spark-hw02-flatmap.py b/examples-python/01SparkRDD/spark-hw02-flatmap.py
new file mode 100644
index 0000000..247af20
--- /dev/null
+++ b/examples-python/01SparkRDD/spark-hw02-flatmap.py
@@ -0,0 +1,6 @@
+from pyspark import SparkContext
+
+# flatMap exercise: split every line on "," and flatten all tokens into a single RDD.
+sc = SparkContext("local", "Simple App")
+textFile = sc.textFile("hdfs:/spark/hw/test.txt")
+
+tokens = textFile.flatMap(lambda line: line.split(',')).collect()
+for token in tokens:
+    print(token)
diff --git a/examples-python/01SparkRDD/spark-hw03-filter.py b/examples-python/01SparkRDD/spark-hw03-filter.py
new file mode 100644
index 0000000..09b9d86
--- /dev/null
+++ b/examples-python/01SparkRDD/spark-hw03-filter.py
@@ -0,0 +1,7 @@
+from pyspark import SparkContext
+
+# filter exercise: keep only the tokens that contain "123" or "456".
+sc = SparkContext("local", "Simple App")
+textFile = sc.textFile("hdfs:/spark/hw/test.txt")
+
+matches = textFile.flatMap(lambda line: line.split(',')) \
+                  .filter(lambda token: "123" in token or "456" in token) \
+                  .collect()
+for match in matches:
+    print(match)
diff --git a/examples-python/01SparkRDD/spark-hw04-reduce.py b/examples-python/01SparkRDD/spark-hw04-reduce.py
new file mode 100644
index 0000000..855d43d
--- /dev/null
+++ b/examples-python/01SparkRDD/spark-hw04-reduce.py
@@ -0,0 +1,9 @@
+from pyspark import SparkContext
+
+# reduce exercise: concatenate the leading letter of every line into one string.
+# Note: string concatenation is not commutative, so with more than one partition
+# the order of the letters in the result is not guaranteed.
+sc = SparkContext("local", "Simple App")
+textFile = sc.textFile("hdfs:/spark/hw/test.txt")
+
+letters = textFile.map(lambda line: line.split(',')) \
+                  .map(lambda fields: fields[0]) \
+                  .reduce(lambda a, b: a + b)
+print(letters)
diff --git a/examples-python/01SparkRDD/spark-hw05-reduceByKey.py b/examples-python/01SparkRDD/spark-hw05-reduceByKey.py
new file mode 100644
index 0000000..fe2f54f
--- /dev/null
+++ b/examples-python/01SparkRDD/spark-hw05-reduceByKey.py
@@ -0,0 +1,11 @@
+from operator import add
+
+from pyspark import SparkContext
+
+# reduceByKey exercise: count how many times each token appears in the test data.
+sc = SparkContext("local", "Simple App")
+textFile = sc.textFile("hdfs:/spark/hw/test.txt")
+
+counts = textFile.flatMap(lambda line: line.split(',')) \
+                 .map(lambda token: (token, 1)) \
+                 .reduceByKey(add) \
+                 .collect()
+for pair in counts:
+    print(pair)
diff --git a/examples-python/README.md b/examples-python/README.md
index 583287c..7320492 100644
--- a/examples-python/README.md
+++ b/examples-python/README.md
@@ -1,2 +1,23 @@
 # Spark examples for Python
 This directory covers developing Spark applications for large-scale data processing in Python.
+
+### Upload the test file to HDFS
+The Spark API examples all operate on the following test data; create it with vim or nano:
+```txt
+$ vim test.txt
+# test data
+a,123,456,789,11344,2142,123
+b,1234,124,1234,123,123
+c,123,4123,5435,1231,5345
+d,123,456,789,113,2142,143
+e,123,446,789,14,2142,113
+f,123,446,789,14,2142,1113,323
+```
+Once the file exists, upload it to HDFS (or to OpenStack Swift). For HDFS:
+```sh
+$ hadoop fs -mkdir -p /spark/hw
+$ hadoop fs -put test.txt /spark/hw
+```
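+
+### Running the examples
+A rough sketch of how the scripts can be submitted once the test file is on HDFS, assuming `spark-submit` is on your PATH and you run it from the `examples-python` directory (each script sets its own master to `local`, so no extra flags are needed):
+```sh
+# assumes test.txt has already been uploaded to hdfs:/spark/hw/test.txt as shown above
+$ spark-submit 01SparkRDD/spark-hw01-map.py
+```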