-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathPython intro(1).json
More file actions
1 lines (1 loc) · 15.8 KB
/
Python intro(1).json
File metadata and controls
1 lines (1 loc) · 15.8 KB
1
{"paragraphs":[{"text":"import pandas as pd","user":"anonymous","dateUpdated":"2018-03-09T04:48:42+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1520116160393_-844476957","id":"20180303-222920_1479131283","dateCreated":"2018-03-03T22:29:20+0000","dateStarted":"2018-03-09T04:48:42+0000","dateFinished":"2018-03-09T04:48:43+0000","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:282"},{"text":"# create series from list\ncities = [\"San Diego\", \"Los Angeles\", \"San Francisco\", \"Seattle\", \"Austin\", \"Chicago\", \"Boston\"]\ns = pd.Series(cities)\nprint(s.values)\nprint(s.index)","user":"anonymous","dateUpdated":"2018-03-07T04:34:43+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"['San Diego' 'Los Angeles' 'San Francisco' 'Seattle' 'Austin' 'Chicago'\n 'Boston']\nRangeIndex(start=0, stop=7, step=1)\n"}]},"apps":[],"jobName":"paragraph_1520118069747_238179841","id":"20180303-230109_1966509905","dateCreated":"2018-03-03T23:01:09+0000","dateStarted":"2018-03-07T04:34:43+0000","dateFinished":"2018-03-07T04:34:43+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:283"},{"text":"# series of numbers\nsales = [150, 250.50, 10.50, 5.0]\nstores = [\"Downtown\", \"Uptown\", \"North\", \"South\"]\ns2 = pd.Series(sales, index=stores)\nprint(s2)\nprint(s2.sum())\nprint(s2.count(), s2.mean(), s2.median(), s2.std()) # ignores missing values\nprint(s2.min(), s2.idxmin()) # idxmin returns the index label of the min value\nprint(s2.max(), s2.idxmax()) # idxmax returns the index label of the max 
value","user":"anonymous","dateUpdated":"2018-03-07T04:34:49+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"Downtown 150.0\nUptown 250.5\nNorth 10.5\nSouth 5.0\ndtype: float64\n416.0\n4 104.0 80.25 118.492615804\n5.0 South\n250.5 Uptown\n"}]},"apps":[],"jobName":"paragraph_1520118355735_-910465836","id":"20180303-230555_1476509549","dateCreated":"2018-03-03T23:05:55+0000","dateStarted":"2018-03-07T04:34:49+0000","dateFinished":"2018-03-07T04:34:49+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:284"},{"text":"#############################################################\n# DataFrames\n#############################################################\n# read_json as dataframe (many more read options to check out)\n#file_json_array = 'https://s3-us-west-2.amazonaws.com/dvannoy-public/sample_data/vehicle_stops.json'\n#file_jsonl = 'https://s3-us-west-2.amazonaws.com/dvannoy-public/sample_data/vehicle_stops_newline_delimited.json'\nfile_jsonl = '/usr/zeppelin/notebook/data/vehicle_stops.jsonl'\n\ndf = pd.read_json(file_jsonl, lines=True)\nprint(\"Num rows %s, Num rows/cols %s\" % (len(df), df.shape))\n\nprint(df.head())\n","user":"anonymous","dateUpdated":"2018-03-09T04:48:48+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"Num rows 103051, Num rows/cols (103051, 15)\n arrested contraband_found obtained_consent property_seized sd_resident \\\n0 n NaN NaN NaN N \n1 n NaN NaN NaN Y \n2 N NaN NaN NaN N \n3 n NaN NaN NaN Y \n4 n NaN NaN NaN Y \n\n searched service_area stop_cause stop_date stop_id stop_time \\\n0 n 120 Moving Violation 2016-03-09 1330039 1:24 \n1 n 
Unknown Moving Violation 2016-03-09 1330062 1:35 \n2 N 120 Moving Violation 2016-03-09 1330047 1:35 \n3 n Unknown Moving Violation 2016-03-09 1330071 1:00 \n4 n 110 Moving Violation 2016-03-09 1330073 1:53 \n\n subject_age subject_race subject_sex timestamp \n0 67 A F 2016-03-09 01:24:00 \n1 59 W F 2016-03-09 01:35:00 \n2 50 H M 2016-03-09 01:35:00 \n3 60 B M 2016-03-09 01:00:00 \n4 50 W M 2016-03-09 01:53:00 \n"}]},"apps":[],"jobName":"paragraph_1520116024489_-1224424981","id":"20180303-222704_1866916877","dateCreated":"2018-03-03T22:27:04+0000","dateStarted":"2018-03-09T04:48:48+0000","dateFinished":"2018-03-09T04:48:51+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:285"},{"text":"#############################################################\n# Clean up data\n#############################################################\n\n# replace missing values\ndf[\"subject_age\"].fillna(0, inplace=True)\n\ndf[\"subject_age\"] = df[\"subject_age\"].replace(\"No Age\", 0).astype(\"int\")\n\ndf.sort_values(\"subject_age\", ascending=False)\ndf[\"subject_age\"].head()\n\n# add 1 to every value: .add() returns a new Series and leaves df unchanged unless the result is assigned\ndf[\"subject_age\"].add(1) # or df[\"subject_age\"] + 1\ndf[\"subject_age\"].head()\n\n# add column (will overwrite if column already exists)\ndf[\"new_col\"] = df[\"subject_age\"] + 1","user":"anonymous","dateUpdated":"2018-03-07T04:36:46+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1520116030570_1066754719","id":"20180303-222710_969221165","dateCreated":"2018-03-03T22:27:10+0000","dateStarted":"2018-03-07T04:36:46+0000","dateFinished":"2018-03-07T04:36:46+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:286"},{"text":"#############################################################\n# Type 
conversions\n#############################################################\n\ndf[\"stop_dt\"] = pd.to_datetime(df[\"stop_date\"])\n\ndef convert_to_bool(item):\n if item == 'Y':\n return True\n elif item == 'N':\n return False\n return None\n\ndf[\"sd_resident\"] = df[\"sd_resident\"].apply(convert_to_bool).astype(\"bool\")","user":"anonymous","dateUpdated":"2018-03-07T04:36:58+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1520119868683_621563768","id":"20180303-233108_14654213","dateCreated":"2018-03-03T23:31:08+0000","dateStarted":"2018-03-07T04:36:58+0000","dateFinished":"2018-03-07T04:36:58+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:287"},{"text":"#############################################################\n# Filtering\n#############################################################\nfiltered_df = df[df[\"sd_resident\"] == False]\nprint(filtered_df.head())\n","user":"anonymous","dateUpdated":"2018-03-07T04:37:13+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":" arrested contraband_found obtained_consent property_seized sd_resident \\\n0 n NaN NaN NaN False \n2 N NaN NaN NaN False \n5 N NaN NaN NaN False \n7 N NaN NaN NaN False \n9 n NaN NaN NaN False \n\n searched service_area stop_cause stop_date stop_id stop_time \\\n0 n 120 Moving Violation 2016-03-09 1330039 1:24 \n2 N 120 Moving Violation 2016-03-09 1330047 1:35 \n5 N 310 Moving Violation 2016-03-09 1330096 2:29 \n7 N 110 Moving Violation 2016-03-09 1330173 7:20 \n9 n 310 Moving Violation 2016-03-09 1330097 7:42 \n\n subject_age subject_race subject_sex timestamp new_col \\\n0 67 A 
F 2016-03-09 01:24:00 68 \n2 50 H M 2016-03-09 01:35:00 51 \n5 53 W M 2016-03-09 02:29:00 54 \n7 30 O M 2016-03-09 07:20:00 31 \n9 49 W M 2016-03-09 07:42:00 50 \n\n stop_dt \n0 2016-03-09 \n2 2016-03-09 \n5 2016-03-09 \n7 2016-03-09 \n9 2016-03-09 \n"}]},"apps":[],"jobName":"paragraph_1520120016099_1714866901","id":"20180303-233336_1021284307","dateCreated":"2018-03-03T23:33:36+0000","dateStarted":"2018-03-07T04:37:13+0000","dateFinished":"2018-03-07T04:37:13+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:288"},{"text":"#############################################################\n# Set index\n#############################################################\n\n# Set Index with column from dataframe and lookup by index\n# note: could also undo that with .reset_index()\ndf.set_index(\"stop_id\", inplace = True)\n\n# lookup on index, faster if sorted\ndf.sort_index(inplace= True)\nprint(df.loc[1330071])\n\n# can still read by position\nprint(df.iloc[3:5])df","user":"anonymous","dateUpdated":"2018-03-07T04:39:25+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"arrested n\ncontraband_found NaN\nobtained_consent NaN\nproperty_seized NaN\nsd_resident True\nsearched n\nservice_area Unknown\nstop_cause Moving Violation\nstop_date 2016-03-09\nstop_time 1:00\nsubject_age 60\nsubject_race B\nsubject_sex M\ntimestamp 2016-03-09 01:00:00\nnew_col 61\nstop_dt 2016-03-09 00:00:00\nName: 1330071, dtype: object\n arrested contraband_found obtained_consent property_seized \\\nstop_id \n192708 N NaN NaN NaN \n192709 N NaN NaN NaN \n\n sd_resident searched service_area stop_cause stop_date \\\nstop_id \n192708 False N 440 Moving Violation 2016-04-13 \n192709 False N 620 Moving Violation 2016-04-13 \n\n stop_time subject_age subject_race subject_sex timestamp \\\nstop_id 
\n192708 10:00 42 L M 2016-04-13 10:00:00 \n192709 10:20 38 W M 2016-04-13 10:20:00 \n\n new_col stop_dt \nstop_id \n192708 43 2016-04-13 \n192709 39 2016-04-13 \n"}]},"apps":[],"jobName":"paragraph_1520120695179_21339039","id":"20180303-234455_1137238211","dateCreated":"2018-03-03T23:44:55+0000","dateStarted":"2018-03-07T04:37:18+0000","dateFinished":"2018-03-07T04:37:18+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:289"},{"text":"df.to_json('/usr/zeppelin/notebook/data/output.jsonl', orient='records', lines=True)","user":"anonymous","dateUpdated":"2018-03-07T04:55:04+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1520120830793_215838093","id":"20180303-234710_292934755","dateCreated":"2018-03-03T23:47:10+0000","dateStarted":"2018-03-07T04:55:04+0000","dateFinished":"2018-03-07T04:55:06+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:290"},{"text":"# Load dataset\nurl = \"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\"\nnames = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']\ndataset = pd.read_csv(url, names=names)\n\n","user":"anonymous","dateUpdated":"2018-03-09T05:06:05+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1520397805088_-1316157460","id":"20180307-044325_1479791395","dateCreated":"2018-03-07T04:43:25+0000","dateStarted":"2018-03-09T05:05:57+0000","dateFinished":"2018-03-09T05:05:57+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:291"},{"text":"import matplotlib.pyplot as plt\ndataset.plot(kind='box', 
subplots=True, layout=(2,2), sharex=False, sharey=False)\nplt.show()","user":"anonymous","dateUpdated":"2018-03-09T05:07:53+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"Traceback (most recent call last):\n File \"/tmp/zeppelin_python-1208381256817230232.py\", line 271, in <module>\n exec(code, _zcUserQueryNameSpace)\n File \"<stdin>\", line 1, in <module>\nImportError: No module named 'matplotlib'\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n File \"/tmp/zeppelin_python-1208381256817230232.py\", line 283, in <module>\n raise Exception(traceback.format_exc())\nException: Traceback (most recent call last):\n File \"/tmp/zeppelin_python-1208381256817230232.py\", line 271, in <module>\n exec(code, _zcUserQueryNameSpace)\n File \"<stdin>\", line 1, in <module>\nImportError: No module named 'matplotlib'\n\n"}]},"apps":[],"jobName":"paragraph_1520400185335_1175662476","id":"20180307-052305_989108392","dateCreated":"2018-03-07T05:23:05+0000","dateStarted":"2018-03-09T05:07:53+0000","dateFinished":"2018-03-09T05:07:53+0000","status":"ERROR","progressUpdateIntervalMs":500,"$$hashKey":"object:292"},{"user":"anonymous","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"text","editOnDblClick":false},"editorMode":"ace/mode/text"},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1520572011020_-1988619365","id":"20180309-050651_346172273","dateCreated":"2018-03-09T05:06:51+0000","status":"READY","progressUpdateIntervalMs":500,"$$hashKey":"object:293"}],"name":"Python 
intro","id":"2D9G8BXZW","angularObjects":{"2D8RBCXE3:shared_process":[],"2D81W9R8S:shared_process":[],"2D86G39HX:shared_process":[],"2D8GJZM3A:shared_process":[],"2D9PZKNW4:shared_process":[],"2DA6392UF:shared_process":[],"2D969RCAD:shared_process":[],"2D9EZVMX4:shared_process":[],"2D9HMF8G3:shared_process":[]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}}