# naep/load_data.py at master · pnb/naep · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
import numpy as np


# Module-level cache: maps a dataset name (e.g. 'train_full') to its loaded DataFrame so each
# dataset is only loaded once per process
df_cache = dict()


def _preprocess(df):
    # Clean a raw event log and add timing columns; returns a new DataFrame and leaves the
    # caller's frame untouched.
    #
    # Expects the columns EventTime, Observable and STUDENTID. The result keeps the index labels
    # of the surviving rows, has EventTime converted to datetimes, and gains two integer columns:
    #   time_unix     -- EventTime in milliseconds since the Unix epoch
    #   delta_time_ms -- milliseconds until the same student's next remaining row (0 for each
    #                    student's last row)
    event_time = pd.to_datetime(df.EventTime)
    # Go through an explicit millisecond unit so the value does not depend on whichever datetime
    # resolution pandas picked while parsing
    time_unix = event_time.values.astype('datetime64[ms]').astype(np.int64)
    df = df.assign(time_unix=time_unix)  # assign() copies, so the input frame is not modified

    keep = (
        # First Text Change basically only happens in one item, and is redundant with the action
        # before, hence remove it
        (df.Observable != 'First Text Change')
        # Basically 100% redundant with "Enter Item"
        & (df.Observable != 'Exit Item')
        # "Calculator Buffer" is a better indicator of closing the calculator, because sometimes
        # it gets automatically closed if the student opens a new scratch area or switches
        # problems
        & (df.Observable != 'Close Calculator')
        # Two null rows causing some outliers in delta time
        & df.EventTime.notnull()
    )
    df = df[keep]
    event_time = event_time[keep]

    # Remove consecutive duplicate rows (keep first). EventTime is still the raw value here, so
    # rows are compared exactly as they appeared in the input.
    changed = (df.shift(1) != df).any(axis=1)
    df = df.loc[changed]
    event_time = event_time.loc[changed]

    # Time until the same student's next event; a student's last event has no successor -> 0
    next_time = df.groupby('STUDENTID').time_unix.shift(-1)
    delta_time_ms = (next_time - df.time_unix).fillna(0).astype(np.int64)

    # Reuse the datetimes parsed at the top instead of parsing the column a second time
    return df.assign(delta_time_ms=delta_time_ms, EventTime=event_time)


def train_full():
    # Return a copy of the preprocessed training events with a per-row integer 'label' column.
    # The CSVs are read and preprocessed on the first call only; the result is kept in df_cache.
    cache_key = 'train_full'
    if cache_key in df_cache:
        return df_cache[cache_key].copy()

    events = pd.read_csv('public_data/data_a_train.csv')
    labels = pd.read_csv('public_data/data_train_label.csv')

    # Sanity check: both files must mention the same number of distinct students
    n_event_students = len(events.STUDENTID.unique())
    n_label_students = len(labels.STUDENTID.unique())
    assert n_event_students == n_label_students

    # Each label row unpacks into exactly two values: (student id, label)
    label_by_student = {}
    for student_id, label in labels.values:
        label_by_student[student_id] = int(label)
    events['label'] = list(map(label_by_student.__getitem__, events.STUDENTID.values))

    df_cache[cache_key] = _preprocess(events)
    return df_cache[cache_key].copy()


def _train_first_minutes(minutes):
    # Shared implementation for train_10m() / train_20m(): return a copy of the training rows
    # that fall within the first `minutes` minutes of each participant's session. The filtered
    # frame is cached in df_cache under 'train_<minutes>m'.
    cache_key = 'train_{}m'.format(minutes)
    if cache_key not in df_cache:
        df = train_full()
        # Each participant's clock starts at their own earliest event (time_unix is in ms)
        start_unix = df.groupby('STUDENTID').time_unix.transform('min')
        df_cache[cache_key] = df[df.time_unix < start_unix + minutes * 60 * 1000]
    return df_cache[cache_key].copy()


def train_10m():
    # Return only first 10 minutes of data per participant
    return _train_first_minutes(10)


def train_20m():
    # Return only first 20 minutes of data per participant
    return _train_first_minutes(20)


def holdout_10m():
    # Return a copy of the preprocessed data from 'public_data/data_a_hidden_10.csv'; the file
    # is read and preprocessed on the first call only and then kept in df_cache
    cache_key = 'holdout_10m'
    if cache_key in df_cache:
        return df_cache[cache_key].copy()
    raw = pd.read_csv('public_data/data_a_hidden_10.csv')
    df_cache[cache_key] = _preprocess(raw)
    return df_cache[cache_key].copy()


def holdout_20m():
    # Return a copy of the preprocessed data from 'public_data/data_a_hidden_20.csv'; the file
    # is read and preprocessed on the first call only and then kept in df_cache
    cache_key = 'holdout_20m'
    if cache_key in df_cache:
        return df_cache[cache_key].copy()
    raw = pd.read_csv('public_data/data_a_hidden_20.csv')
    df_cache[cache_key] = _preprocess(raw)
    return df_cache[cache_key].copy()


def holdout_30m():
    # Return a copy of the preprocessed data from 'public_data/data_a_hidden_30.csv'; the file
    # is read and preprocessed on the first call only and then kept in df_cache
    cache_key = 'holdout_30m'
    if cache_key in df_cache:
        return df_cache[cache_key].copy()
    raw = pd.read_csv('public_data/data_a_hidden_30.csv')
    df_cache[cache_key] = _preprocess(raw)
    return df_cache[cache_key].copy()


def all_unique_rows():
    # Stack the full training data and all three holdout sets into one frame with a fresh
    # 0..n-1 index. The 10m/20m training subsets are left out on purpose: they overlap with the
    # full training data.
    loaders = (train_full, holdout_10m, holdout_20m, holdout_30m)
    frames = [load() for load in loaders]
    return pd.concat(frames, ignore_index=True, sort=False)


def all_full_rows():
    # Stack the data from 30m sessions (full training data plus the 30m holdout set) into one
    # frame with a fresh 0..n-1 index
    full_sessions = [train_full(), holdout_30m()]
    combined = pd.concat(full_sessions, ignore_index=True, sort=False)
    return combined