diff --git a/.gitignore b/.gitignore index af6d502..520f5dc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +*.csv +*.png + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fromNotebook.py b/fromNotebook.py index 32d948c..2bfd54b 100644 --- a/fromNotebook.py +++ b/fromNotebook.py @@ -6,7 +6,7 @@ import matplotlib.pyplot as plt -print("-------------start-------------") +print("\n -------------start-------------") # For reproducibility np.random.seed(1237) @@ -24,9 +24,9 @@ #merge the input-columns into one big string column 'merged' df["merged"] = df["CITY"].map(str) + ' ' + df["INSTITUTION_FULL"] + ' ' + df["ADDRESS_FULL"] -df.head(9) - - +print(df.head(9)) +df.to_csv('merged.csv') +df.reindex(np.random.permutation(df.index)) # shuffle the rows, because I suspect some row-dependence... # prepare data for keras: @@ -43,7 +43,7 @@ # 32 states and NA num_labels = 23 # there are only 25 states with institiutions in the dataset -vocab_size = 10000 +vocab_size = 3000 # how many different words to keep? batch_size = 150 from keras.preprocessing.text import Tokenizer @@ -81,12 +81,12 @@ print("\n\n ---------- and now the actual keras training: ----------- \n\n") model = Sequential() -model.add(Dense(int(512/3), input_shape=(vocab_size,))) +model.add(Dense(int(512/2), input_shape=(vocab_size,))) +model.add(Activation('relu')) +model.add(Dropout(0.4)) +model.add(Dense(int(256/2))) model.add(Activation('relu')) -model.add(Dropout(0.3)) -#model.add(Dense(int(256/2))) -#model.add(Activation('relu')) -#model.add(Dropout(0.3)) +model.add(Dropout(0.2)) model.add(Dense(num_labels)) model.add(Activation('softmax')) model.summary() @@ -135,7 +135,8 @@ #print(test_files_names.iloc[i]) #print(f"\n {i} input-text: {train_input[i]} " )#+ train_input[i]) #print(f"x_train {i}: {x_train[i]},\n y_train i: {y_train[i]}") - print('Label:' + test_tags.iloc[i].ljust(19) + " -Predicted label: " + predicted_label.ljust(19) + f" {np.max(prediction[0])}") + print('Label:' + test_tags.iloc[i].ljust(19) + " -Predicted label: " + + predicted_label.ljust(19) + f" {np.max(prediction[0]):4.2f}") #print(prediction) print(text_labels) \ No newline at end of file diff --git a/model.png b/model.png index a53959a..eb05c04 100644 Binary files a/model.png and b/model.png differ