diff --git a/README.md b/README.md
index b0367eb..d39aa63 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,26 @@
-## Hiring Challenge for Data Scientists
+
-### Introduction
+**ScoreUp** is a simple scoring model that you can use to score customer 'health' based on their previous purchases. It takes a dataset file path as input, and the dataset must include the following three columns, whose names are passed as arguments:
-Two of our "clients" have a similar problem: they each have data on which of their customers made purchases at which times, and they each want to assign a value to each of their customers (there could be different commercial reasons for this -- for instance you may want to use the information to target an email campaign at customers that are at risk of never purchasing again, or simply to quantify customer health over time).
-Your task is to build a solution that can take either client's data, and returns a "health" score for each customer.
+1. date_col: Name of the column holding purchase dates, in any parseable format.
+2. id_col: Name of the column holding the customer ID.
+3. price_col: Name of the column holding the purchase amount.
-### Instructions
+ScoreUp prepares and preprocesses the dataset, splits it into training and test sets, and then fits an ordinary linear regression model from the scikit-learn library.
-Build a standalone command-line application in the language of your choice that takes the path of a single input CSV datafile as a command-line argument and
- - loads and validates the input dataset of customer transaction data
- - trains a model that predicts a customer's health as a float from `0.0 - 1.0` given their transaction history
- - prints a CSV file containing the customer ID and health score per row to `stdout`
+**Future Improvements:**
-The `orders.zip` archive contains two sample transaction datasets that can each be used as input to generate customer predictions. The files come from two different domains and are independent, with their own schema and consist of "messy" real world data - your solution is expected to be able to work with each sample dataset individually to output predictions.
+- More data can be provided in addition to the purchase data, and the model can be improved in light of the new data.
-You can use any 1st- or 3rd-party existing library functions, packages, frameworks, models, and solvers you like, or can build a solution/model from scratch if you prefer.
+
+- The data processing functions are written around the two example input datasets. More general rules could be added to give the flexibility of working with any dataset.
+
+- More independent modules could be added so that:
+  - a model can be trained on a training set and then saved to local disk
+  - ready-to-use models are available for making score predictions
+  - a saved model can be loaded and used to make score predictions
+
+**How to Run:**
+
+python get_scores.py -f <file_path> -d <date_column_name> -id <id_column_name> -p <price_column_name>
-There is no right answer as such, we will mainly be looking at code quality, data preprocessing skills, completeness of the solution from a software engineering perspective, and clarity of thought.
-To get started, we recommend forking and cloning this repo, and then either point us to your fork or submit a PR - thanks!
 (Note, you'll need `git-lfs` installed to pull down the datasets, or just download direct from the [GitHub source browser](https://github.com/nstack/hiring-ds/blob/master/orders.zip))
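To make the expected input shape concrete, here is a minimal sketch of a conforming dataset and the matching invocation. The column names `order_date`, `customer_id`, and `amount`, and the file name `sample_orders.csv`, are illustrative placeholders only, not names required by the script:

```python
# Hypothetical example: build a tiny dataset that satisfies the three required
# columns and write it to disk. Column names are placeholders; pass whatever
# names your file actually uses via -d, -id and -p.
import pandas as pd

orders = pd.DataFrame({
    "order_date":  ["2019-01-05", "2019-01-20", "2019-02-11", "2019-02-28"],
    "customer_id": [101, 102, 101, 103],
    "amount":      ["12.50", "7,300", "8.00", "42.10"],  # messy strings (e.g. thousands separators) are tolerated
})
orders.to_csv("sample_orders.csv", index=False)

# Corresponding command line:
#   python get_scores.py -f sample_orders.csv -d order_date -id customer_id -p amount
```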
diff --git a/Score UP.png b/Score UP.png
new file mode 100644
index 0000000..1b6536d
Binary files /dev/null and b/Score UP.png differ
diff --git a/get_scores.py b/get_scores.py
new file mode 100644
index 0000000..1bcaa3e
--- /dev/null
+++ b/get_scores.py
@@ -0,0 +1,69 @@
+
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
+
+from utils import *
+
+def main(args):
+
+    assert os.path.isfile(args.filepath), 'File path not found... must provide a valid file path...'
+
+    print('Preprocessing data...')
+    processed_df = prepare_data(args.filepath, args.date_col, args.id_col, args.price_col)
+
+    print('Splitting data into training and test sets...')
+    X_train, X_test, y_train, y_test = split_data(processed_df, args.date_col, args.id_col, args.price_col)
+
+    print('Training set size : ', len(X_train))
+    print('Test set size : ', len(X_test))
+
+    # Fit an ordinary linear regression model and predict the target (last-month spend).
+    model = LinearRegression()
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+
+    print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))
+    print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(y_test, y_pred)))
+    print('R2 Score: ', r2_score(y_test, y_pred))
+
+    # Rescale the raw predictions to the 0.0 - 1.0 range to obtain health scores.
+    y_pred = min_max_scaler(pd.DataFrame(y_pred))
+
+    ids = pd.DataFrame(X_test.index)
+    scores = pd.concat([ids, y_pred], axis=1)
+    scores.columns = ['id', 'score']
+    with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
+        print(scores)
+
+    scores.to_csv('score_report.csv', index=False)
+    print('Score report is saved to local disk.')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-f', '--filepath',
+        help="Dataset file path to use as input; example: your_file_path.csv",
+        required=True, type=str)
+    parser.add_argument(
+        '-d', '--date_col',
+        help="Date column name in dataset file",
+        required=True, type=str)
+    parser.add_argument(
+        '-id', '--id_col',
+        help="Customer ID column name in dataset file",
+        required=True, type=str)
+    parser.add_argument(
+        '-p', '--price_col',
+        help="Price column name in dataset file",
+        required=True, type=str)
+
+    args = parser.parse_args()
+
+    main(args)
diff --git a/orders.zip b/orders.zip
deleted file mode 100644
index 6c30209..0000000
--- a/orders.zip
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ecaddc9abc1caab60674492bc35d5af9ceb32da09b073873f68ec6611cb050f2
-size 13171540
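The Future Improvements list in the README mentions training a model, saving it to local disk, and later reloading it to make score predictions. A minimal sketch of how that workflow could look with joblib follows; the function names and the `scoreup_model.joblib` file name are assumptions, not part of the current code:

```python
# Hypothetical sketch of the save/load workflow described under
# "Future Improvements" in the README; joblib handles model persistence.
import joblib
from sklearn.linear_model import LinearRegression

def train_and_save(X_train, y_train, path="scoreup_model.joblib"):
    """Fit a linear model and persist it to local disk."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    joblib.dump(model, path)
    return path

def load_and_predict(X_new, path="scoreup_model.joblib"):
    """Reload a previously saved model and score new feature rows."""
    model = joblib.load(path)
    return model.predict(X_new)
```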
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..56b9c64
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,74 @@
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+
+def prepare_data(file_dir, date_col, id_col, price_col):
+    # Load only the three columns we need.
+    df = pd.read_csv(file_dir, usecols=[date_col, id_col, price_col])
+
+    # Prices may arrive as strings with thousands separators, e.g. "7,300".
+    df[price_col] = df[price_col].astype(str)
+    df[price_col] = df[price_col].str.replace(',', '')
+
+    # Normalise common "missing value" spellings to NaN.
+    df.replace([-1, "null", "nan", "NaN", "NaT", "nat"], np.nan, inplace=True)
+
+    # Treat non-positive prices as missing and impute them with the column mean.
+    df[price_col] = pd.to_numeric(df[price_col])
+    non_positive = df[price_col] <= 0.0
+    df.loc[non_positive, price_col] = np.nan
+
+    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
+    df[price_col] = mean_imputer.fit_transform(np.array(df[price_col]).reshape(-1, 1))
+
+    # Rows without a customer id or a date cannot be used.
+    df.dropna(subset=[id_col, date_col], inplace=True)
+    df[date_col] = pd.to_datetime(df[date_col])
+
+    return df
+
+
+def pivot_df_feature(df, date_col, id_col, price_col):
+    # Total spend per customer per calendar month.
+    df_pivot_1 = pd.pivot_table(df, values=price_col, index=[id_col],
+                                columns=df[date_col].dt.month, aggfunc=np.sum)
+    df_pivot_1 = df_pivot_1.add_suffix('_price')
+
+    # Number of purchases per customer per calendar month.
+    df_pivot_2 = pd.pivot_table(df, values=price_col, index=[id_col],
+                                columns=df[date_col].dt.month, aggfunc=np.count_nonzero)
+    df_pivot_2 = df_pivot_2.add_suffix('_count')
+
+    df_pivot = pd.concat([df_pivot_1, df_pivot_2], axis=1)
+    df_pivot.fillna(0, inplace=True)
+
+    return df_pivot
+
+def pivot_df_target(df, date_col, id_col, price_col):
+    # Total spend per customer in the target month.
+    df_pivot = pd.pivot_table(df, values=price_col, index=[id_col],
+                              columns=df[date_col].dt.month, aggfunc=np.sum)
+    df_pivot = df_pivot.add_suffix('_price')
+    df_pivot.fillna(0, inplace=True)
+
+    return df_pivot
+
+def split_data(df, date_col, id_col, price_col):
+    df = df.sort_values(by=date_col)
+
+    # Use the last month in the data as the target period and everything before it as features.
+    last_month = df[date_col].iloc[-1].strftime("%m-%y")
+    target_data = df[df[date_col].dt.strftime("%m-%y") == last_month]
+    feature_data = df[df[date_col].dt.strftime("%m-%y") != last_month]
+
+    # Pivot target and feature data.
+    target_data = pivot_df_target(target_data, date_col, id_col, price_col)
+    feature_data = pivot_df_feature(feature_data, date_col, id_col, price_col)
+
+    # Keep only customers present in both periods, then split into train and test sets.
+    idx = feature_data.index.intersection(target_data.index)
+    target_data = target_data.loc[idx].sort_index()
+    feature_data = feature_data.loc[idx].sort_index()
+
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_data, test_size=0.30)
+
+    return X_train, X_test, y_train, y_test
+
+def min_max_scaler(x):
+    # Rescale a single column of values to the 0.0 - 1.0 range.
+    float_array = x.values.astype(float)
+    scaler = MinMaxScaler()
+    scaled = pd.DataFrame(scaler.fit_transform(float_array.reshape(-1, 1)))
+
+    return scaled
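To clarify the feature construction performed in `utils.py`, the following small sketch reproduces the monthly pivots from `pivot_df_feature` on a toy frame; the data, column names, and values are made up purely for illustration:

```python
# Illustrative only: show what the monthly pivots in utils.py produce.
import pandas as pd
import numpy as np

toy = pd.DataFrame({
    "date": pd.to_datetime(["2019-01-05", "2019-01-20", "2019-02-11", "2019-02-28"]),
    "id": [101, 101, 101, 102],
    "price": [10.0, 5.0, 8.0, 42.0],
})

# Sum of spend per customer per calendar month (columns are month numbers).
spend = pd.pivot_table(toy, values="price", index=["id"],
                       columns=toy["date"].dt.month, aggfunc=np.sum).add_suffix("_price")

# Number of purchases per customer per calendar month.
count = pd.pivot_table(toy, values="price", index=["id"],
                       columns=toy["date"].dt.month, aggfunc=np.count_nonzero).add_suffix("_count")

# One row per customer; columns 1_price, 2_price, 1_count, 2_count, with
# missing customer/month combinations filled with 0.
print(pd.concat([spend, count], axis=1).fillna(0))
```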