Commit
code to train classifiers
Sabrina Hoppe committed May 5, 2018
1 parent 0403f2c commit 34ff610
Showing 7 changed files with 438 additions and 0 deletions.
18 changes: 18 additions & 0 deletions 01_train_classifiers.sh
@@ -0,0 +1,18 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
    # the for loop below will start n_parallel_jobs for each trait,
    # unless n_total_jobs is reached first (this is checked in the if statement)
    mi=$((li + n_parallel_jobs))
    if [ "$mi" -gt "$n_total_jobs" ]
    then
        mi=$n_total_jobs
    fi

    for t in $(seq 0 6);
    do
        python2.7 -m classifiers.train_classifier -t $t -s 0 -a 0 -l $li -m $mi
        wait
    done
done
20 changes: 20 additions & 0 deletions 02_train_specialized_classifiers.sh
@@ -0,0 +1,20 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
    # the for loop below will start n_parallel_jobs for each trait and data subset,
    # unless n_total_jobs is reached first (this is checked in the if statement)
    mi=$((li + n_parallel_jobs))
    if [ "$mi" -gt "$n_total_jobs" ]
    then
        mi=$n_total_jobs
    fi

    for t in $(seq 0 6);
    do
        for a in 1 2; do
            python2.7 -m classifiers.train_classifier -t $t -s 0 -a $a -l $li -m $mi
            wait
        done
    done
done
99 changes: 99 additions & 0 deletions 03_train_baseline.py
@@ -0,0 +1,99 @@
import numpy as np
from config import conf
from sklearn.cross_validation import LabelKFold as LKF
from sklearn.cross_validation import StratifiedKFold as SKF
from sklearn.metrics import f1_score, accuracy_score
import pandas as pns

def load_data(ws, t):
    _, y_file, id_file = conf.get_merged_feature_files(ws)
    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
    ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:, 0]
    return y_ws, ids_ws

def get_baseline_f1_score(t):
    """
    train a most-frequent-class baseline for trait t and return the macro F1 score it achieves
    (uses the global participant_scores that is loaded in __main__)
    """
    outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)

    preds = np.zeros((conf.n_participants), dtype=int)
    truth = np.zeros((conf.n_participants), dtype=int)

    for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
        inner_performance = np.zeros((conf.n_inner_folds, len(conf.all_window_sizes)))

        for ws_i in xrange(0, len(conf.all_window_sizes)):
            ws = conf.all_window_sizes[ws_i]

            # load data for this window size
            y_ws, ids_ws = load_data(ws, t)

            # cut out the outer train samples
            outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
            outer_train_y = y_ws[outer_train_samples]
            outer_train_y_ids = ids_ws[outer_train_samples]

            # build inner cross-validation such that all samples of one person are either in training or testing
            inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
            for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
                # create inner train and test samples. Note: both are taken from outer train samples!
                inner_y_train = outer_train_y[inner_train_indices]
                unique_inner_test_ids = np.unique(outer_train_y_ids[inner_test_indices])

                # predict the most frequent class from the training set
                hist, _ = np.histogram(inner_y_train, bins=[0.5, 1.5, 2.5, 3.5])
                guess = np.argmax(hist) + 1
                innerpreds = np.full(len(unique_inner_test_ids), guess, dtype=int)
                innertruth = participant_scores[unique_inner_test_ids]

                inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))

        # evaluate classifier on outer cv using the best window size from inner cv
        chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
        chosen_ws = conf.all_window_sizes[chosen_ws_i]
        y, ids = load_data(chosen_ws, t)

        outer_train_samples = np.array([p in outer_train_participants for p in ids])
        outer_test_samples = np.array([p in outer_test_participants for p in ids])

        if outer_train_samples.any() and outer_test_samples.any():
            y_train = y[outer_train_samples]

            # guess the most frequent class
            hist, _ = np.histogram(y_train, bins=[0.5, 1.5, 2.5, 3.5])
            guess = np.argmax(hist) + 1

            for testp in outer_test_participants:
                if testp in ids[outer_test_samples]:
                    preds[testp] = guess
                    truth[testp] = participant_scores[testp]
                else:
                    # participant does not occur in the outer test set, e.g. because their time in the shop was too short
                    preds[testp] = -1
                    truth[testp] = -1
                    print 'not enough samples for participant', testp
        else:
            # no samples for the chosen window size; mark all test participants as missing
            for testp in outer_test_participants:
                preds[testp] = -1
                truth[testp] = -1

    f1 = f1_score(truth, preds, average='macro')
    return f1

# If the program is run directly:
if __name__ == "__main__":
    df = []
    for trait in xrange(0, conf.n_traits):
        participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(trait + 1,))
        print conf.medium_traitlabels[trait]
        for si in xrange(0, conf.max_n_iter):
            f1 = get_baseline_f1_score(trait)
            print '\t' + str(si) + ':', f1
            df.append([f1, conf.medium_traitlabels[trait], si])
    df_pns = pns.DataFrame(data=df, columns=['F1', 'trait', 'iteration'])
    df_pns.to_csv(conf.result_folder + '/most_frequ_class_baseline.csv')
    print conf.result_folder + '/most_frequ_class_baseline.csv written.'
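The resulting CSV has one row per trait and iteration with the columns `F1`, `trait` and `iteration`, so the baseline scores can be summarized with a few lines of pandas; a minimal sketch, assuming `conf.result_folder` points at `results`:

```python
import pandas as pd

# mean baseline F1 score per trait, averaged over all iterations
df = pd.read_csv('results/most_frequ_class_baseline.csv', index_col=0)
print df.groupby('trait')['F1'].mean()
```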
18 changes: 18 additions & 0 deletions 04_label_permutation_test.sh
@@ -0,0 +1,18 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
    # the for loop below will start n_parallel_jobs for each trait,
    # unless n_total_jobs is reached first (this is checked in the if statement)
    mi=$((li + n_parallel_jobs))
    if [ "$mi" -gt "$n_total_jobs" ]
    then
        mi=$n_total_jobs
    fi

    for t in $(seq 0 6);
    do
        python2.7 -m classifiers.train_classifier -t $t -s 1 -a 0 -l $li -m $mi
        wait
    done
done
18 changes: 18 additions & 0 deletions README.md
@@ -25,7 +25,25 @@ reproducing the paper results step by step:
1. __Extract features from raw gaze data__:
`python 00_compute_features.py` to compute gaze features for all participants
Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy` where XX is the participant number and YY the length of the sliding window in seconds (a quick way to inspect such a file is sketched below).
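A feature file can be sanity-checked directly with NumPy; a minimal sketch, where the participant number and window size are placeholders:

```python
import numpy as np

# load the extracted features for one participant and one window size
feats = np.load('features/Participant01/window_features_10.npy')
print feats.shape
```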
2. __Train random forest classifiers__
`./01_train_classifiers.sh` to reproduce the evaluation setting described in the paper, in which each classifier was trained 100 times.
`./02_train_specialized_classifiers.sh` to train specialized classifiers on parts of the data (specifically on data from inside the shop or on the way).


If the scripts cannot be executed, you might not have the right access permissions to do so. On Linux, you can try `chmod +x 01_train_classifiers.sh`, `chmod +x 02_train_specialized_classifiers.sh` and `chmod +x 04_label_permutation_test.sh` (see below for when/how to use the last script).

In case you want to call the script differently, e.g. to speed up the computation or to try different parameters, you can pass the following arguments to `classifiers.train_classifier`:
`-t` trait index between 0 and 6
`-s` label shuffling: 0 (original labels), 1 (permuted labels, as used in the label permutation test)
`-l` lowest repetition number, e.g. 0
`-m` max number of repetitions, e.g. 100
`-a` using partial data only: 0 (all data), 1 (way data), 2 (shop data)
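For example, the following call would train repetitions 0 to 9 for the third trait on way data only (the flag values here are just an illustration):

`python2.7 -m classifiers.train_classifier -t 2 -s 0 -a 1 -l 0 -m 10`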

In case of performance issues, it might be useful to check `_conf.py` and change `max_n_jobs` to restrict the number of jobs (i.e. threads) running in parallel.

The results will be saved in `results/A0` for all data, `results/A1` for way data only and `results/A2` for data inside a shop. Each file is named `TTT_XXX.npz`, where TTT is the abbreviation of the personality trait (`O`,`C`,`E`,`A`,`N` for the Big Five and `CEI` or `PCS` for the two curiosity measures). XXX enumerates the classifiers (remember that we always train 100 classifiers for evaluation because there is some randomness involved in the training process).
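Each `.npz` file can be opened with NumPy to see what was stored; a minimal sketch (the file name is hypothetical, and the archive keys depend on what `classifiers.train_classifier` saves):

```python
import numpy as np

# open one result archive and list the arrays it contains
data = np.load('results/A0/O_000.npz')
for key in data.files:
    print key, data[key].shape
```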

3. __Evaluate Baselines__
* To train a classifier that always predicts the most frequent personality score range from its current training set, please execute `python 03_train_baseline.py`
* To train classifiers on permuted labels, i.e. perform the so-called label permutation test, please execute `./04_label_permutation_test.sh` (a sketch of the underlying idea follows below)
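To illustrate the idea behind the label permutation test: classifiers trained on randomly shuffled labels should only reach chance-level performance, so their scores form a null distribution against which the score on the real labels can be compared. Below is a minimal, self-contained sketch with placeholder data (scikit-learn >= 0.18; not the repository's implementation):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
features = rng.rand(100, 10)            # placeholder feature matrix
labels = rng.randint(1, 4, size=100)    # placeholder binned personality scores

clf = RandomForestClassifier(n_estimators=100, random_state=0)
true_score = cross_val_score(clf, features, labels, cv=5).mean()

# null distribution: repeat the training on randomly permuted labels
null_scores = [cross_val_score(clf, features, rng.permutation(labels), cv=5).mean()
               for _ in range(100)]

# empirical p-value: how often permuted labels match or beat the real labels
p_value = (1 + sum(s >= true_score for s in null_scores)) / float(1 + len(null_scores))
print p_value
```

scikit-learn also provides `sklearn.model_selection.permutation_test_score`, which implements this procedure directly.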
## Citation
Empty file added classifiers/__init__.py
