Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
feature extraction code
  • Loading branch information
Sabrina Hoppe committed May 5, 2018
1 parent a20084f commit cebde1b
Show file tree
Hide file tree
Showing 9 changed files with 1,279 additions and 2 deletions.
103 changes: 103 additions & 0 deletions 00_compute_features.py
@@ -0,0 +1,103 @@
import numpy as np
import os
from config import conf as conf
from featureExtraction import gaze_analysis as ga
import threading
import getopt
import sys
from config import names as gs

def compute_sliding_window_features(participant, ws, gazeAnalysis_instance):
    """
    Compute the sliding window features for one participant and one window size and save them.

    The given gazeAnalysis instance is asked for its window features, using
    window size `ws` and the step size that conf derives from `ws`. The returned
    features and window times are written to the two .npy files that conf
    assigns to this participant and window size.
    """
    step = conf.get_step_size(ws)
    features, times = gazeAnalysis_instance.get_window_features(ws, step)

    features_path = conf.get_window_features_file(participant, ws)
    np.save(features_path, features)

    times_path = conf.get_window_times_file(participant, ws)
    np.save(times_path, times)

def _require_raw_files(participant):
    """Return the paths of the gaze, pupil diameter and events files of a participant.

    Raises IOError naming every missing file. An explicit exception is used
    instead of `assert` because asserts are removed when Python runs with -O.
    """
    folder = conf.get_data_folder(participant)
    paths = [folder + '/' + name
             for name in ('gaze_positions.csv', 'pupil_diameter.csv', 'events.csv')]
    missing = [path for path in paths if not os.path.exists(path)]
    if missing:
        raise IOError('missing raw data file(s): ' + ', '.join(missing))
    return paths


def _extract_participant_features(participant):
    """Compute and save the window features of one participant for all window sizes.

    One thread per window size is started; window sizes whose feature file and
    times file both exist already are skipped. Returns after all threads finished.
    """
    # create data folder, plus one subfolder for this participant
    feature_dir = conf.get_feature_folder(participant)
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)

    # make sure all relevant raw data files exist in the right folder
    gaze_file, pupil_diameter_file, events_file = _require_raw_files(participant)

    # load relevant data
    gaze = np.genfromtxt(gaze_file, delimiter=',', skip_header=1)
    pupil_diameter = np.genfromtxt(pupil_diameter_file, delimiter=',', skip_header=1)
    events = np.genfromtxt(events_file, delimiter=',', skip_header=1, dtype=str)

    # create instance of gazeAnalysis class that will be used for feature extraction
    # this already does some initial computation that will be useful for all window sizes:
    extractor = ga.gazeAnalysis(gaze, conf.fixation_radius_threshold, conf.fixation_duration_threshold,
                                conf.saccade_min_velocity, conf.max_saccade_duration,
                                pupil_diameter=pupil_diameter, event_strings=events)

    # compute sliding window features by creating one thread per window size
    # NOTE(review): all threads share the same extractor instance - confirm that
    # get_window_features does not modify shared state.
    threads = []
    for window_size in conf.all_window_sizes:
        # the features file is saved before the times file, so an interrupted run
        # can leave only one of them behind; recompute unless both are present
        already_done = (os.path.exists(conf.get_window_features_file(participant, window_size)) and
                        os.path.exists(conf.get_window_times_file(participant, window_size)))
        if not already_done:
            thread = threading.Thread(target=compute_sliding_window_features,
                                      args=(participant, window_size, extractor))
            thread.start()
            threads.append(thread)

    for thread in threads:
        thread.join()


def _merge_window_features(window_size, binned_personality, trait_labels, annotation):
    """Merge the features of all participants for one window size into three csv files.

    merged_features includes all features,
    merged_traits contains the ground truth personality score ranges,
    merged_ids contains the participant number and context (way, shop, half of the recording).
    Exits the program with status 1 if a participant's feature or times file is missing.
    """
    windowfeats_subtask_all = []
    windowfeats_subtask_ids = []
    windowfeats_subtask_all_y = []

    for p in range(conf.n_participants):
        featfilename = conf.get_window_features_file(p, window_size)
        timesfilename = conf.get_window_times_file(p, window_size)
        missing = [name for name in (featfilename, timesfilename) if not os.path.exists(name)]
        if missing:
            # report the file(s) that are really missing, not always the features file
            print('did not find ' + ', '.join(missing))
            sys.exit(1)

        data = np.load(featfilename).tolist()
        n_windows = len(data)
        windowfeats_subtask_all.extend(data)
        windowfeats_subtask_all_y.extend([binned_personality[p, 1:]] * n_windows)

        # presumably column 0 is the start and column 1 the end of each window - verify against gaze_analysis
        times = np.load(timesfilename)[:, 2:]
        # presumably ann[0] / ann[1] are the times of entering / leaving the shop - verify against info/annotation.csv
        ann = annotation[p, 1:]

        ids_annotation = np.zeros((n_windows, 3), dtype=int)  # person, way/shop, half
        ids_annotation[:, 0] = p
        # the comparisons are strict, so a window that overlaps one of the two
        # annotated time stamps matches no condition and keeps 0 in column 1
        ids_annotation[(times[:, 1] < ann[0]), 1] = conf.time_window_annotation_wayI
        ids_annotation[(times[:, 0] > ann[0]) & (times[:, 1] < ann[1]), 1] = conf.time_window_annotation_shop
        ids_annotation[(times[:, 0] > ann[1]), 1] = conf.time_window_annotation_wayII
        # floor division keeps the slice index an int on Python 2 and Python 3
        half = n_windows // 2
        ids_annotation[:half, 2] = conf.time_window_annotation_halfI
        ids_annotation[half:, 2] = conf.time_window_annotation_halfII

        windowfeats_subtask_ids.extend(ids_annotation.tolist())

    ids = np.array(windowfeats_subtask_ids)
    x = np.array(windowfeats_subtask_all, dtype=float)
    y = np.array(windowfeats_subtask_all_y)
    f1, f2, f3 = conf.get_merged_feature_files(window_size)

    np.savetxt(f1, x, delimiter=',', header=','.join(gs.full_long_label_list), comments='')
    # NOTE(review): trait_labels holds every column label of the personality file while
    # y only holds columns 1 and up, so this header has one more entry than y has columns.
    np.savetxt(f2, y, delimiter=',', header=','.join(trait_labels), comments='')
    # NOTE(review): ids has three columns (person, way/shop, half) but the header names only one.
    np.savetxt(f3, ids, delimiter=',', header='Participant ID', comments='')


def main():
    """Extract the window features of every participant, then merge them per window size."""
    for p in range(conf.n_participants):
        _extract_participant_features(p)
        print('finished all features for participant %s' % (p,))

    # load ground truth from info folder:
    binned_personality = np.genfromtxt(conf.binned_personality_file, delimiter=',', skip_header=1)
    trait_labels = np.loadtxt(conf.binned_personality_file, delimiter=',', dtype=str)[0, :]
    annotation = np.genfromtxt(conf.annotation_path, delimiter=',', skip_header=1)

    for window_size in conf.all_window_sizes:
        print('merging window size %s' % (window_size,))
        _merge_window_features(window_size, binned_personality, trait_labels, annotation)


if __name__ == "__main__":
    main()
11 changes: 9 additions & 2 deletions README.md
@@ -1,8 +1,7 @@
# Eye movements during everyday behavior predict personality traits
*Sabrina Hoppe, Tobias Loetscher, Stephanie Morey and Andreas Bulling*

This repository provides all data used for the publication [in Frontiers in Human Neuroscience](https://dx.doi.org/10.3389/fnhum.2018.00105).
Code is coming soon!
This repository provides all data and code used for the publication [in Frontiers in Human Neuroscience](https://dx.doi.org/10.3389/fnhum.2018.00105).

## Dataset
* Gaze data recorded at 60Hz from 42 participants is stored in `data/ParticipantXX`.
Expand All @@ -20,6 +19,14 @@ Code is coming soon!

* Timestamps indicating the times when participants entered and left the shop are given in `info/annotation.csv` in seconds.


## Code
Reproducing the paper results step by step:
1. __Extract features from raw gaze data__:
`python 00_compute_features.py` to compute gaze features for all participants
Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy` where XX is the participant number and YY the length of the sliding window in seconds.


## Citation
If you want to cite this project, please use the following Bibtex format:

Expand Down
1 change: 1 addition & 0 deletions __init__.py
@@ -0,0 +1 @@

Empty file added config/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions config/conf.py
@@ -0,0 +1,97 @@
import numpy as np

# global parameters
n_participants = 42  # 00_compute_features.py iterates over participants 0 .. n_participants-1
n_traits = 7
# 207 is also the length of full_label_list in config/names.py
# (45 event + 64 heatmap + 34 position + 2 * 32 wordbook labels)
max_n_feat = 207
max_n_iter = 100
all_window_sizes = [5, 15, 30, 45, 60, 75, 90, 105, 120, 135]  # sliding window lengths in seconds
all_shop_window_sizes = [5, 15] # at least 3/4 of the people have a time window in these times

# cross validation parameters
n_inner_folds = 3
n_outer_folds = 5

# Random Forest Parameters
tree_max_features = 15
tree_max_depth = 5
n_estimators = 100
max_n_jobs = 5

# given a window size, determine step size correctly for even and odd numbers
def get_step_size(window_size):
    """Return the sliding window step size, which is half of the window size.

    The step size is returned as an int if half of the window size is a whole
    number (e.g. 30 -> 15) and as a float otherwise (e.g. 5 -> 2.5).
    """
    step_size = window_size / 2.0
    # test for a whole number directly; checking the parity of step_size * 10
    # would truncate fractional steps such as 0.2 (-> 0) or 1.2 (-> 1)
    if step_size % 1 == 0:
        step_size = int(step_size)
    return step_size

# relative paths (resolved against the current working directory of the running script)
data_folder = 'data'
info_folder = 'info'
feature_folder = 'features'
result_folder = 'results'
figure_folder = 'figures'
# files inside the info folder
annotation_path = info_folder + '/annotation.csv'
binned_personality_file = info_folder + '/binned_personality.csv'
personality_sex_age_file = info_folder + '/personality_sex_age.csv'

# load the personality trait names from file and map them to abbreviations
# The names are taken from the first row of the file, skipping its first column.
# This runs when the module is imported, so the file has to exist at import time.
traitlabels = np.loadtxt(binned_personality_file, delimiter=',', dtype=str)[0, 1:]
def get_abbr(s):
    """Return the abbreviation of s: the first characters of all its words that start with an upper case letter."""
    initials = [word[0] for word in s.split()]
    return ''.join(letter for letter in initials if letter.isupper())
# medium labels: only names that contain a space are abbreviated, single words are kept
medium_traitlabels = [s if " " not in s else get_abbr(s) for s in traitlabels]
# short labels: every name is abbreviated
short_traitlabels = [get_abbr(tl) for tl in traitlabels]


# dynamically create relative paths for result files to create
def get_result_folder(annotation_val):
    """Return the result sub folder for an annotation value, i.e. result_folder + '/A' + annotation value."""
    subfolder = 'A' + str(annotation_val)
    return result_folder + '/' + subfolder

def get_result_filename(annotation_val, trait, shuffle_labels, i, add_suffix=False):
    """Return the path of a result file inside the result folder of annotation_val.

    The file name is the short label of the trait with index `trait`, followed by
    '_rnd' if shuffle_labels is set, by '_' and `i` padded to three digits, and by
    '.npz' if add_suffix is set.
    """
    parts = [get_result_folder(annotation_val), '/', short_traitlabels[trait]]
    if shuffle_labels:
        parts.append('_rnd')
    parts.extend(['_', str(i).zfill(3)])
    if add_suffix:
        parts.append('.npz')
    return ''.join(parts)

def get_feature_folder(participant):
    """Return the feature folder of a participant; the participant number is padded to two digits."""
    participant_id = str(participant).zfill(2)
    return '/'.join((feature_folder, 'Participant' + participant_id))

def get_merged_feature_files(window_size):
    """Return the paths of the merged features, merged traits and merged ids csv files (in this order) for a window size."""
    ending = '_' + str(window_size) + '.csv'
    return tuple(feature_folder + '/merged_' + content + ending
                 for content in ('features', 'traits', 'ids'))

def get_data_folder(participant):
    """Return the raw data folder of a participant; the participant number is padded to two digits."""
    participant_id = str(participant).zfill(2)
    return '/'.join((data_folder, 'Participant' + participant_id))

def get_window_times_file(participant, window_size):
    """Return the path of the .npy file with the window times of a participant for a window size."""
    filename = "window_times_" + str(window_size) + '.npy'
    return get_feature_folder(participant) + "/" + filename

def get_window_features_file(participant, window_size):
    """Return the path of the .npy file with the window features of a participant for a window size."""
    filename = "window_features_" + str(window_size) + '.npy'
    return get_feature_folder(participant) + "/" + filename

def get_overall_features_file(participant):
    """Return the path of the overall features .npy file inside the feature folder of a participant."""
    folder = get_feature_folder(participant)
    return folder + "/" + "overall_features.npy"


# parameters for fixation/saccade detection
# (handed to the gazeAnalysis constructor in 00_compute_features.py;
# NOTE(review): the units are not stated in this file - confirm against gaze_analysis)
fixation_radius_threshold = 0.025
fixation_duration_threshold = 0.1
saccade_min_velocity = 2
max_saccade_duration = 0.5

# annotation constants (as given as arguments to train_classifier, and as used for file names in result_folder)
annotation_all = 0
annotation_ways = 1
annotation_shop = 2
annotation_values = [annotation_all, annotation_ways, annotation_shop]

# annotations used in merged_ids_* files in the feature_folder
# (columns are counted from 0; column 0 holds the participant number)
# column 1: part of the recording; windows that match none of the three parts keep the value 0
time_window_annotation_wayI = 1
time_window_annotation_shop = 2
time_window_annotation_wayII = 3
# column 2: first or second half of a participant's windows
time_window_annotation_halfI = 1
time_window_annotation_halfII = 2
160 changes: 160 additions & 0 deletions config/names.py
@@ -0,0 +1,160 @@
# labels of the values stored per fixation;
# each fix_* constant below is the position of the corresponding label in this list
fixations_list_labels = ['mean x', 'mean y',
                         'var x', 'var y',
                         't start', 't end',
                         'start index', 'end index',
                         'mean diameter', 'var diameter',
                         'mean successive angles', 'var successive angles'
                         ]
fix_mean_x_i = 0
fix_mean_y_i = 1
fix_var_x_i = 2
fix_var_y_i = 3
fix_start_t_i = 4
fix_end_t_i = 5
fix_start_index_i = 6
fix_end_index_i = 7
fix_mean_diam_i = 8
fix_var_diam_i = 9
# note: unlike the other index constants, the next two names carry no '_i' suffix
fix_mean_succ_angles = 10
fix_var_succ_angles = 11

# labels of the values stored per saccade;
# each sacc_* constant below is the position of the corresponding label in this list
saccades_list_labels = ['start x', 'start y',
                        'end x', 'end y',
                        'angle',
                        't start', 't end',
                        'start index', 'end index',
                        'mean diameter', 'var diameter',
                        'peak velocity', 'amplitude',
                        ]

sacc_start_x_i = 0
sacc_start_y_i = 1
sacc_end_x_i = 2
sacc_end_y_i = 3
sacc_angle_i = 4
sacc_t_start_i = 5
sacc_t_end_i = 6
sacc_start_index_i = 7
sacc_end_index_i = 8
sacc_mean_diam_i = 9
sacc_var_diam_i = 10
sacc_peak_vel_i = 11
sacc_amplitude_i = 12

# labels of the values stored per blink;
# each blink_* constant below is the position of the corresponding label in this list
blink_list_labels = ['t start', 't end', 'start index', 'end index']

blink_start_t_i = 0
# NOTE(review): 'blink_end_ti_i' looks like a typo for 'blink_end_t_i'; renaming it
# would break modules that use this name, so check all users before changing it
blink_end_ti_i = 1
blink_start_index_i = 2
blink_end_index_i = 3

# short labels of the 45 event based features; the trailing comments give the list index of each entry
event_feature_labels = ['fixation rate', 'saccade rate', # 0 1
                        'small sacc. rate', 'large sacc. rate', 'positive sacc. rate', 'negative sacc. rate', # 2 3 4 5
                        'ratio sacc - fix', # 6
                        'ratio small sacc', 'ratio large sacc', 'ratio right sacc', 'ratio left sacc', # 7 8 9 10
                        'mean sacc amplitude', 'var sacc amplitude', 'min sacc amplitude', 'max sacc amplitude', # 11 12 13 14
                        'mean peak velocity', 'var peak velocity', 'min peak velocity', 'max peak velocity', # 15 16 17 18
                        'mean mean diameter sacc', 'var mean diameter sacc', 'mean var diameter sacc', # 19 20 21
                        'var var diameter sacc', # 22
                        'mean fix duration', 'var fix duration', 'min fix duration', 'max fix duration', # 23 24 25 26
                        'dwelling time', # 27
                        'mean mean subsequent angle', 'var mean subsequent angle', 'mean var subsequent angle', 'var var subsequent angle', # 28 29 30 31
                        'mean var x', 'mean var y', 'var var x', 'var var y', # 32 33 34 35
                        'mean mean diameter fix', 'var mean diameter fix', 'mean var diameter fix', 'var var diameter fix', # 36 37 38 39
                        'mean blink duration', 'var blink duration', 'min blink duration', 'max blink duration', # 40 41 42 43
                        'blink rate' # 44
                        ]

# long labels of the 45 event based features, in the same order as event_feature_labels;
# the trailing comments give the list index of each entry
event_feature_labels_long = ['fixation rate', 'saccade rate', # 0 1
                             'small saccade rate', 'large saccade rate', 'positive saccade rate', 'negative saccade rate', # 2 3 4 5
                             'saccade:fixation ratio', # 6
                             'ratio of small saccades', 'ratio of large saccades', 'ratio of right saccades', 'ratio of left saccades', # 7 8 9 10
                             'mean saccade amplitude', 'var saccade amplitude', 'min saccade amplitude', 'max saccade amplitude', # 11 12 13 14
                             'mean saccadic peak velocity', 'var saccadic peak velocity', 'min saccadic peak velocity', 'max saccadic peak velocity', # 15 16 17 18
                             'mean of the mean pupil diameter during saccades', 'var of the mean pupil diameter during saccades', # 19 20
                             'mean of the var pupil diameter during saccades', 'var of the var pupil diameter during saccades', # 21 22
                             'mean fixation duration', 'var fixation duration', 'min fixation duration', 'max fixation duration', # 23 24 25 26
                             'dwelling time', # 27
                             'mean of the mean of subsequent angles', 'var of the mean of subsequent angles', # 28 29
                             'mean of the var of subsequent angles', 'var of the var of subsequent angles', # 30 31
                             'mean of the var of x', 'mean of the var of y', 'var of the var of x', 'var of the var of y', # 32 33 34 35
                             'mean of the mean pupil diameter during fixations', 'var of the mean pupil diameter during fixations', # 36 37
                             'mean of the var pupil diameter during fixations', 'var of the var pupil diameter during fixations', # 38 39
                             'mean blink duration', 'var blink duration', 'min blink duration', 'max blink duration', # 40 41 42 43
                             'blink rate' # 44
                             ]

def get_wordbook_feature_labels(movement_abbreviation):
    """Return the 32 short wordbook feature labels for a movement abbreviation.

    For each wordbook 1 to 4 there are eight labels, one per statistic, of the
    form movement abbreviation + statistic + ' WB' + wordbook number.
    """
    statistics = ('>0', 'max', 'min', 'arg max', 'arg min', 'range', 'mean', 'var')
    labels = []
    for n in (1, 2, 3, 4):
        wordbook = ' WB' + str(n)
        for statistic in statistics:
            labels.append(movement_abbreviation + statistic + wordbook)
    return labels

def get_wordbook_feature_labels_long(movement_abbreviation):
    """Return the 32 long wordbook feature labels for a movement abbreviation.

    For each n from 1 to 4 there are eight labels of the form
    prefix + n + '-gram ' + movement abbreviation + suffix,
    in the same order as the short wordbook labels.
    """
    templates = (('number of different ', ' movements'),
                 ('max frequency ', ' movements'),
                 ('min frequency ', ' movements'),
                 ('most frequent ', ' movement'),
                 ('least frequent ', ' movement'),
                 ('range of frequencies of ', ' movements'),
                 ('mean frequency of ', ' movements'),
                 ('var frequency of ', ' movements'))
    labels = []
    for n in range(1, 5):
        ngram = str(n) + '-gram ' + movement_abbreviation
        for prefix, suffix in templates:
            labels.append(prefix + ngram + suffix)
    return labels

# short labels of the 34 position based features:
# eleven statistics, each for x, y and diameter, plus one angle feature
position_feature_labels = ['mean x', 'mean y', 'mean diameter',
                           'min x', 'min y', 'min diameter',
                           'max x', 'max y', 'max diameter',
                           'min-max x', 'min-max y', 'min-max diameter',
                           'std x', 'std y', 'std diameter',
                           'median x', 'median y', 'median diameter',
                           '1st quart x', '1st quart y', '1st quart diameter',
                           '3rd quart x', '3rd quart y', '3rd quart diameter',
                           'IQR x', 'IQR y', 'IQR diameter',
                           'mean abs diff x', 'mean abs diff y', 'mean abs diff diameter',
                           'mean diff x', 'mean diff y', 'mean diff diameter',
                           'mean subsequent angle'
                           ]

# long labels of the 34 position based features, in the same order as position_feature_labels
# NOTE(review): the short 'mean abs diff' labels are paired with 'mean difference of subsequent ...',
# which does not mention the absolute value - confirm against the feature computation
position_feature_labels_long = ['mean x', 'mean y', 'mean pupil diameter',
                                'minimum x', 'minimum y', 'minimum pupil diameter',
                                'maximum x', 'maximum y', 'maximum pupil diameter',
                                'range x', 'range y', 'range pupil diameter',
                                'std x', 'std y', 'std pupil diameter',
                                'median x', 'median y', 'median pupil diameter',
                                '1st quartile x', '1st quartile y', '1st quartile pupil diameter',
                                '3rd quartile x', '3rd quartile y', '3rd quartile pupil diameter',
                                'inter quartile range x', 'inter quartile range y', 'inter quartile range pupil diameter',
                                'mean difference of subsequent x', 'mean difference of subsequent y', 'mean difference of subsequent pupil diameters',
                                'mean diff x', 'mean diff y', 'mean diff pupil diameter',
                                'mean subsequent angle'
                                ]

# short and long labels of the 64 heatmap features, numbered 00 to 63
# (range instead of xrange, so that the module can be imported with Python 2 and Python 3)
heatmap_feature_labels = ['heatmap_'+str(i).zfill(2) for i in range(0, 64)]
heatmap_feature_labels_long = ['heatmap cell '+str(i).zfill(2) for i in range(0, 64)]

# short labels of all features: event, heatmap and position features,
# followed by the wordbook features for 'sacc.' and for 'SF'
# (45 + 64 + 34 + 32 + 32 = 207 labels)
full_label_list = event_feature_labels + heatmap_feature_labels + position_feature_labels + \
                  get_wordbook_feature_labels('sacc.') + get_wordbook_feature_labels('SF')

# long labels of all features, in the same order as full_label_list
full_long_label_list = event_feature_labels_long + heatmap_feature_labels_long + position_feature_labels_long + \
                       get_wordbook_feature_labels_long('sacc.') + get_wordbook_feature_labels_long('SF')


def _ngram_bins(alphabet, n):
    """Return all concatenations of n symbols from alphabet; the last symbol varies fastest."""
    bins = ['']
    for _ in range(n):
        bins = [prefix + symbol for prefix in bins for symbol in alphabet]
    return bins


# symbols used to encode saccades, and all sequences of two, three and four of them
sacc_dictionary = ['A', 'B', 'C', 'R', 'E', 'F', 'G', 'D', 'H', 'J', 'K', 'L', 'M', 'N', 'O', 'U', 'u', 'b', 'r', 'f',
                   'd', 'j', 'l', 'n']
sacc_bins_two = _ngram_bins(sacc_dictionary, 2)
sacc_bins_three = _ngram_bins(sacc_dictionary, 3)
sacc_bins_four = _ngram_bins(sacc_dictionary, 4)
sacc_bins = [sacc_dictionary, sacc_bins_two, sacc_bins_three, sacc_bins_four]

# symbols used to encode saccades and fixations, and all sequences of two, three and four of them
saccFix_dictionary = ['S_lu', 'S_ld', 'S_lr', 'S_ll', 'S_su', 'S_sd', 'S_sr', 'S_sl', 'F_l', 'F_s']
saccFix_bins_two = _ngram_bins(saccFix_dictionary, 2)
saccFix_bins_three = _ngram_bins(saccFix_dictionary, 3)
saccFix_bins_four = _ngram_bins(saccFix_dictionary, 4)
saccFix_bins = [saccFix_dictionary, saccFix_bins_two, saccFix_bins_three, saccFix_bins_four]

def write_pami_feature_labels_to_file(targetfile):
    """Write the index, short label and long label of every feature to targetfile as csv.

    The first line is the header ',short,long'; each following line holds the
    position of a feature in full_label_list, its short label and its long label.
    An existing file is overwritten.
    """
    # the with statement closes the file even if one of the writes fails
    with open(targetfile, 'w') as f:  # creates if it does not exist
        f.write(',short,long\n')
        for i, (item1, item2) in enumerate(zip(full_label_list, full_long_label_list)):
            f.write(str(i) + ',' + item1 + ',' + item2 + '\n')
Empty file added featureExtraction/__init__.py
Empty file.

0 comments on commit cebde1b

Please sign in to comment.