diff --git a/01_train_classifiers.sh b/01_train_classifiers.sh
new file mode 100644
index 0000000..6df47a2
--- /dev/null
+++ b/01_train_classifiers.sh
@@ -0,0 +1,18 @@
+n_parallel_jobs=6 # number of jobs that will be run in parallel
+n_total_jobs=100
+for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
+do
+    # the loop below starts one training process per trait, covering repetitions li to mi-1,
+    # i.e. n_parallel_jobs at a time, unless n_total_jobs is reached first (checked in the if statement)
+    mi=`expr "$li" + "$n_parallel_jobs"`
+    if [ "$mi" -gt "$n_total_jobs" ]
+    then
+        mi=$n_total_jobs
+    fi
+
+    for t in $(seq 0 6);
+    do
+        python2.7 -m classifiers.train_classifier -t $t -s 0 -a 0 -l $li -m $mi
+        wait
+    done
+done
diff --git a/02_train_specialized_classifiers.sh b/02_train_specialized_classifiers.sh
new file mode 100644
index 0000000..5e82ddc
--- /dev/null
+++ b/02_train_specialized_classifiers.sh
@@ -0,0 +1,20 @@
+n_parallel_jobs=6 # number of jobs that will be run in parallel
+n_total_jobs=100
+for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
+do
+    # the loop below starts one training process per trait and data subset (way/shop), covering
+    # repetitions li to mi-1, unless n_total_jobs is reached first (checked in the if statement)
+    mi=`expr "$li" + "$n_parallel_jobs"`
+    if [ "$mi" -gt "$n_total_jobs" ]
+    then
+        mi=$n_total_jobs
+    fi
+
+    for t in $(seq 0 6);
+    do
+        for a in 1 2; do
+            python2.7 -m classifiers.train_classifier -t $t -s 0 -a $a -l $li -m $mi
+            wait
+        done
+    done
+done
diff --git a/03_train_baseline.py b/03_train_baseline.py
new file mode 100644
index 0000000..936d442
--- /dev/null
+++ b/03_train_baseline.py
@@ -0,0 +1,99 @@
+import os
+import numpy as np
+from config import conf
+from sklearn.cross_validation import LabelKFold as LKF
+from sklearn.cross_validation import StratifiedKFold as SKF
+from sklearn.metrics import f1_score, accuracy_score
+import pandas as pns
+
+def load_data(ws, t):
+    _, y_file, id_file = conf.get_merged_feature_files(ws)
+    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
+    ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:, 0]
+    return y_ws, ids_ws
+
+def get_baseline_f1_score(t):
+    """
+    train a baseline classifier for trait t and return the F1 score it achieves
+    """
+    outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)
+
+    preds = np.zeros((conf.n_participants), dtype=int)
+    truth = np.zeros((conf.n_participants), dtype=int)
+
+    for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
+        inner_performance = np.zeros((conf.n_inner_folds, len(conf.all_window_sizes)))
+
+        for ws_i in xrange(0, len(conf.all_window_sizes)):
+            ws = conf.all_window_sizes[ws_i]
+
+            # load data for this window size
+            y_ws, ids_ws = load_data(ws, t)
+
+            # cut out the outer train samples
+            outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
+            outer_train_y = y_ws[outer_train_samples]
+            outer_train_y_ids = ids_ws[outer_train_samples]
+
+            # build inner cross validation such that all samples of one person are either in training or testing
+            inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
+            for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
+                # create inner train and test samples. Note: both are taken from outer train samples!
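+                # (the indices are relative to the outer training fold, not to the
+                #  full data set, so the outer test participants stay unseen here)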
+                inner_y_train = outer_train_y[inner_train_indices]
+                unique_inner_test_ids = np.unique(outer_train_y_ids[inner_test_indices])
+
+                # predict the most frequent class from the training set:
+                # with bin edges [0.5, 1.5, 2.5, 3.5] each bin counts one score value
+                # (1, 2 or 3), so argmax + 1 recovers the most frequent score
+                hist, _ = np.histogram(inner_y_train, bins=[0.5, 1.5, 2.5, 3.5])
+                guess = np.argmax(hist) + 1
+                innerpreds = np.full(len(unique_inner_test_ids), guess, dtype=int)
+                innertruth = participant_scores[unique_inner_test_ids]
+
+                inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))
+
+        # evaluate classifier on outer cv using the best window size from inner cv
+        chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
+        chosen_ws = conf.all_window_sizes[chosen_ws_i]
+        y, ids = load_data(chosen_ws, t)
+
+        outer_train_samples = np.array([p in outer_train_participants for p in ids])
+        outer_test_samples = np.array([p in outer_test_participants for p in ids])
+
+        if outer_train_samples.size > 0 and outer_test_samples.size > 0:
+            y_train = y[outer_train_samples]
+
+            # guess the most frequent class
+            hist, _ = np.histogram(y_train, bins=[0.5, 1.5, 2.5, 3.5])
+            guess = np.argmax(hist) + 1
+
+            for testp in outer_test_participants:
+                if testp in ids[outer_test_samples]:
+                    preds[testp] = guess
+                    truth[testp] = participant_scores[testp]
+                else:
+                    # participant does not occur in the outer test set, e.g. because their time in the shop was too short
+                    preds[testp] = -1
+                    truth[testp] = -1
+                    print 'not enough samples for participant', testp
+        else:
+            for testp in outer_test_participants:
+                preds[testp] = -1
+                truth[testp] = -1
+
+    f1 = f1_score(truth, preds, average='macro')
+    return f1
+
+# If the program is run directly:
+if __name__ == "__main__":
+    df = []
+    for trait in xrange(0, conf.n_traits):
+        # participant_scores is read as a global by get_baseline_f1_score
+        participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(trait + 1,))
+        print conf.medium_traitlabels[trait]
+        for si in xrange(0, conf.max_n_iter):
+            f1 = get_baseline_f1_score(trait)
+            print '\t' + str(si) + ':', f1
+            df.append([f1, conf.medium_traitlabels[trait], si])
+    # make sure the output folder exists (mirrors classifiers/train_classifier.py)
+    if not os.path.exists(conf.result_folder):
+        os.makedirs(conf.result_folder)
+    df_pns = pns.DataFrame(data=df, columns=['F1', 'trait', 'iteration'])
+    df_pns.to_csv(conf.result_folder + '/most_frequ_class_baseline.csv')
+    print conf.result_folder + '/most_frequ_class_baseline.csv written.'
diff --git a/04_label_permutation_test.sh b/04_label_permutation_test.sh
new file mode 100644
index 0000000..d658411
--- /dev/null
+++ b/04_label_permutation_test.sh
@@ -0,0 +1,18 @@
+n_parallel_jobs=6 # number of jobs that will be run in parallel
+n_total_jobs=100
+for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
+do
+    # the loop below starts one training process per trait, covering repetitions li to mi-1,
+    # i.e. n_parallel_jobs at a time, unless n_total_jobs is reached first (checked in the if statement)
+    mi=`expr "$li" + "$n_parallel_jobs"`
+    if [ "$mi" -gt "$n_total_jobs" ]
+    then
+        mi=$n_total_jobs
+    fi
+
+    for t in $(seq 0 6);
+    do
+        python2.7 -m classifiers.train_classifier -t $t -s 1 -a 0 -l $li -m $mi
+        wait
+    done
+done
diff --git a/README.md b/README.md
index 1f9b061..8f8ba0f 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,25 @@ reproducing the paper results step by step:
 1. __Extract features from raw gaze data__:
    `python 00_compute_features.py` to compute gaze features for all participants
    Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy` where XX is the participant number and YY the length of the sliding window in seconds.
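+   For a quick sanity check you can load such a file directly (a minimal sketch; the participant number and window size are placeholders for values that exist on your machine):
+   `python2.7 -c "import numpy as np; print np.load('features/Participant01/window_features_15.npy').shape"`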
+2. __Train random forest classifiers__
+   `./01_train_classifiers.sh` to reproduce the evaluation setting described in the paper, in which each classifier was trained 100 times.
+   `./02_train_specialized_classifiers.sh` to train specialized classifiers on parts of the data (specifically on data from inside the shop or on the way).
+   If the scripts cannot be executed, you might not have the required access permissions. On Linux, you can try `chmod +x 01_train_classifiers.sh`, `chmod +x 02_train_specialized_classifiers.sh` and `chmod +x 04_label_permutation_test.sh` (see below for when/how to use the last script).
+
+   In case you want to call the script differently, e.g. to speed up the computation or to try different parameters, you can pass the following arguments to `classifiers.train_classifier` (see the example invocation below):
+   `-t` trait index between 0 and 6
+   `-l` lowest number of repetitions, e.g. 0
+   `-m` max number of repetitions, e.g. 100
+   `-a` using partial data only: 0 (all data), 1 (way data), 2 (shop data)
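+
+   For example, the following call trains repetitions 0 to 9 for the first trait on way data only (`-m` is exclusive, so `-l 0 -m 10` trains 10 classifiers):
+   `python2.7 -m classifiers.train_classifier -t 0 -a 1 -l 0 -m 10`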
+
+   In case of performance issues, it might be useful to check `_conf.py` and change `max_n_jobs` to restrict the number of jobs (i.e. threads) running in parallel.
+
+   The results will be saved in `results/A0` for all data, `results/A1` for way data only and `results/A2` for data from inside a shop. Each file is named `TTT_XXX.npz`, where TTT is the abbreviation of the personality trait (`O`, `C`, `E`, `A`, `N` for the Big Five and `CEI` or `PCS` for the two curiosity measures) and XXX enumerates the classifiers (remember that we always train 100 classifiers for evaluation because there is some randomness involved in the training process).
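+
+   Such a result file can be inspected with numpy, for instance (a minimal sketch; the file name is a placeholder, check the results folder for the exact names):
+   `python2.7 -c "import numpy as np; r = np.load('results/A0/O_000.npz'); print r['f1']"`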
+
+3. __Evaluate Baselines__
+   * To train a classifier that always predicts the most frequent personality score range from its current training set, please execute `python 03_train_baseline.py`
+   * To train classifiers on permuted labels, i.e. to perform the so-called label permutation test, please execute `./04_label_permutation_test.sh`
 
 ## Citation
diff --git a/classifiers/__init__.py b/classifiers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/classifiers/train_classifier.py b/classifiers/train_classifier.py
new file mode 100644
index 0000000..3ab1299
--- /dev/null
+++ b/classifiers/train_classifier.py
@@ -0,0 +1,265 @@
+import sys
+import numpy as np
+from config import conf
+import os
+import getopt
+import threading
+from sklearn.cross_validation import LabelKFold as LKF
+from sklearn.cross_validation import StratifiedKFold as SKF
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import f1_score, accuracy_score
+
+
+def predict_all():
+    # add threads to a list, and wait for all of them in the end
+    threads = []
+
+    for trait in trait_list:
+        for si in xrange(low_repetitions, num_repetitions):
+            fname = conf.get_result_filename(annotation_value, trait, shuffle_labels, si, add_suffix=True)
+            if not os.path.exists(fname):
+                thread = threading.Thread(target=save_predictions, args=(trait, conf.get_result_filename(annotation_value, trait, shuffle_labels, si), si))
+                sys.stdout.flush()
+                thread.start()
+                threads.append(thread)
+            else:
+                print "existing solution:", fname
+
+    print 'waiting for all threads to join'
+    for thread in threads:
+        thread.join()
+
+def load_data(ws, annotation_value, t, chosen_features=None):
+    x_file, y_file, id_file = conf.get_merged_feature_files(ws)
+    x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
+    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
+    ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)
+    if annotation_value == conf.annotation_all:
+        ids_ws = ids_ws[:, 0]
+    elif annotation_value == conf.annotation_shop:
+        in_shop = ids_ws[:, 1] == conf.time_window_annotation_shop
+        x_ws = x_ws[in_shop, :]
+        y_ws = y_ws[in_shop]
+        ids_ws = ids_ws[in_shop, 0]
+    elif annotation_value == conf.annotation_ways:
+        on_way = (ids_ws[:, 1] == conf.time_window_annotation_wayI) | (ids_ws[:, 1] == conf.time_window_annotation_wayII)
+        x_ws = x_ws[on_way, :]
+        y_ws = y_ws[on_way]
+        ids_ws = ids_ws[on_way, 0]
+    else:
+        print 'unknown annotation value', annotation_value
+        print 'should be 0 (all data), 1 (way data) or 2 (shop data).'
+        sys.exit(1)
+    if chosen_features is not None:
+        x_ws = x_ws[:, chosen_features]
+    return x_ws, y_ws, ids_ws
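+
+# load_data returns one row per time window: x_ws holds the gaze features,
+# y_ws the binned score of trait t for the participant a window belongs to,
+# and ids_ws that participant's number (used to group windows per person)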
+
+
+def save_predictions(t, filename, rs):
+    """
+    train a classifier and write results to file
+    """
+    # create RandomForest classifier with parameters given in _conf.py
+    clf = RandomForestClassifier(random_state=rs, verbose=verbosity, class_weight='balanced',
+                                 n_estimators=conf.n_estimators, n_jobs=conf.max_n_jobs, max_features=conf.tree_max_features,
+                                 max_depth=conf.tree_max_depth)
+
+    # create StandardScaler that will be used to scale each feature
+    # such that it has mean 0 and std 1 on the training set
+    scaler = StandardScaler(with_std=True, with_mean=True)
+
+    # use ground truth to create folds for outer cross validation in a stratified way, i.e. such that
+    # each label occurs equally often
+    participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(t + 1,))
+    outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)
+
+    # initialise arrays to save information
+    feat_imp = np.zeros((len(outer_cv), conf.max_n_feat))  # feature importance
+    preds = np.zeros((conf.n_participants), dtype=int)  # predictions on participant level
+    detailed_preds = np.zeros((conf.n_participants), dtype=object)  # predictions on time window level, array of lists
+    chosen_ws_is = np.zeros((conf.n_participants), dtype=int)  # indices of window sizes chosen in the inner cross validation
+
+    for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
+        print
+        print str(outer_i + 1) + '/' + str(conf.n_outer_folds)
+
+        # find best window size in inner cv, and discard unimportant features
+        inner_performance = np.zeros((conf.n_inner_folds, len(all_window_sizes)))
+        inner_feat_importances = np.zeros((conf.max_n_feat, len(all_window_sizes)))
+
+        for ws_i in xrange(0, len(all_window_sizes)):
+            ws = all_window_sizes[ws_i]
+            print '\t', 'ws ' + str(ws_i + 1) + '/' + str(len(all_window_sizes))
+
+            # load data for this window size
+            x_ws, y_ws, ids_ws = load_data(ws, annotation_value, t)
+            if shuffle_labels:
+                np.random.seed(316588 + 111 * t + rs)
+                perm = np.random.permutation(len(y_ws))
+                y_ws = y_ws[perm]
+                ids_ws = ids_ws[perm]
+
+            # cut out the outer train samples
+            outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
+            outer_train_x = x_ws[outer_train_samples, :]
+            outer_train_y = y_ws[outer_train_samples]
+            outer_train_y_ids = ids_ws[outer_train_samples]
+
+            # build inner cross validation such that all samples of one person are either in training or testing
+            inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
+            for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
+                # create inner train and test samples. Note: both are taken from outer train samples!
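+                # (the outer test participants are therefore never seen while the
+                #  window size and the informative features are selected)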
+                inner_x_train = outer_train_x[inner_train_indices, :]
+                inner_y_train = outer_train_y[inner_train_indices]
+
+                inner_x_test = outer_train_x[inner_test_indices, :]
+                inner_y_test = outer_train_y[inner_test_indices]
+
+                # fit scaler on train set and scale both train and test set with the result
+                scaler.fit(inner_x_train)
+                inner_x_train = scaler.transform(inner_x_train)
+                inner_x_test = scaler.transform(inner_x_test)
+
+                # fit Random Forest
+                clf.fit(inner_x_train, inner_y_train)
+
+                # save predictions and feature importance
+                inner_pred = clf.predict(inner_x_test)
+                inner_feat_importances[:, ws_i] += clf.feature_importances_
+
+                # compute and save performance in terms of accuracy:
+                # majority voting over all time windows of one participant yields one prediction per participant
+                innerpreds = []
+                innertruth = []
+                inner_test_ids = outer_train_y_ids[inner_test_indices]
+                for testp in np.unique(inner_test_ids):
+                    (values, counts) = np.unique(inner_pred[inner_test_ids == testp], return_counts=True)
+                    ind = np.argmax(counts)
+                    innerpreds.append(values[ind])
+                    innertruth.append(inner_y_test[inner_test_ids == testp][0])
+                inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))
+                print '  ACC: ', '%.2f' % (inner_performance[inner_i, ws_i] * 100)
+
+        # evaluate classifier on outer cv using the best window size from inner cv, and the most informative features
+        chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
+        chosen_ws = all_window_sizes[chosen_ws_i]
+        chosen_features = (inner_feat_importances[:, chosen_ws_i] / float(conf.n_inner_folds)) > 0.005
+
+        # reload all data
+        x, y, ids = load_data(chosen_ws, annotation_value, t, chosen_features=chosen_features)
+        if shuffle_labels:
+            np.random.seed(316588 + 111 * t + rs + 435786)
+            perm = np.random.permutation(len(y))
+            y = y[perm]
+            ids = ids[perm]
+
+        outer_train_samples = np.array([p in outer_train_participants for p in ids])
+        outer_test_samples = np.array([p in outer_test_participants for p in ids])
+
+        if outer_train_samples.size > 0 and outer_test_samples.size > 0:
+            x_train = x[outer_train_samples, :]
+            y_train = y[outer_train_samples]
+
+            x_test = x[outer_test_samples, :]
+            y_test = y[outer_test_samples]
+
+            # scaling
+            scaler.fit(x_train)
+            x_train = scaler.transform(x_train)
+            x_test = scaler.transform(x_test)
+
+            # fit Random Forest
+            clf.fit(x_train, y_train)
+            pred = clf.predict(x_test)
+
+            for testp in outer_test_participants:
+                chosen_ws_is[testp] = chosen_ws_i
+                if testp in ids[outer_test_samples]:
+                    # majority voting over all samples that belong to participant testp
+                    (values, counts) = np.unique(pred[ids[outer_test_samples] == testp], return_counts=True)
+                    ind = np.argmax(counts)
+                    preds[testp] = values[ind]
+                    detailed_preds[testp] = list(pred[ids[outer_test_samples] == testp])
+                else:
+                    # participant does not occur in the outer test set, e.g. because their time in the shop was too short
+                    preds[testp] = -1
+                    detailed_preds[testp] = []
+
+            # save the resulting feature importance
+            feat_imp[outer_i, chosen_features] = clf.feature_importances_
+
+        else:
+            for testp in outer_test_participants:
+                chosen_ws_is[testp] = -1
+                preds[testp] = -1
+                detailed_preds[testp] = []
+            feat_imp[outer_i, chosen_features] = -1
+
+    # compute the resulting F1 score over all participants with a valid prediction and save to file
+    nonzero_preds = preds[preds > 0]
+    nonzero_truth = participant_scores[preds > 0]
+    f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')
+    np.savez(filename, f1=f1, predictions=preds, chosen_window_indices=chosen_ws_is,
+             feature_importances=feat_imp, detailed_predictions=detailed_preds)
+    print f1, 'written', filename
+
+# If the program is run directly:
+if __name__ == "__main__":
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "t:m:l:s:a:", [])
+    except getopt.GetoptError:
+        print 'valid arguments:'
+        print '-t trait index'
+        print '-s 1 to perform the label permutation test; omit -s or use -s 0 otherwise'
+        print '-l lowest number of repetitions'
+        print '-m max number of repetitions'
+        print '-a using partial data only: 0 (all data), 1 (way data), 2 (shop data)'
+        sys.exit(2)
+
+    low_repetitions = 0
+    num_repetitions = conf.max_n_iter
+    verbosity = 0
+    shuffle_labels = False
+    annotation_value = conf.annotation_all
+    trait_list = xrange(0, conf.n_traits)
+
+    for opt, arg in opts:
+        if opt == '-t':
+            t = int(arg)
+            assert t in trait_list
+            trait_list = [t]
+        elif opt == '-a':
+            annotation_value = int(arg)
+            assert annotation_value in conf.annotation_values
+        elif opt == '-s':
+            shuffle_labels = bool(int(arg))
+        elif opt == '-m':
+            num_repetitions = int(arg)
+        elif opt == '-l':
+            low_repetitions = int(arg)
+        else:
+            print 'valid arguments:'
+            print '-t trait index'
+            print '-s 1 to perform the label permutation test; omit -s or use -s 0 otherwise'
+            print '-l lowest number of repetitions'
+            print '-m max number of repetitions'
+            print '-a using partial data only: 0 (all data), 1 (way data), 2 (shop data)'
+            sys.exit(2)
+
+    result_folder = conf.get_result_folder(annotation_value)
+    if not os.path.exists(result_folder):
+        os.makedirs(result_folder)
+
+    # restrict window sizes in case shop data should be used
+    if annotation_value == conf.annotation_shop:
+        all_window_sizes = conf.all_shop_window_sizes
+    else:
+        all_window_sizes = conf.all_window_sizes
+
+    predict_all()