From b516c32fbe470246e12d1816539eec298f417cbd Mon Sep 17 00:00:00 2001 From: Tom Theile Date: Mon, 26 Aug 2019 11:35:00 +0200 Subject: [PATCH] input was wrong --- fromNotebook.py | 71 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/fromNotebook.py b/fromNotebook.py index 7190786..a48e330 100644 --- a/fromNotebook.py +++ b/fromNotebook.py @@ -4,7 +4,7 @@ import pickle from pathlib import Path - +import matplotlib.pyplot as plt print("-------------start-------------") # For reproducibility @@ -12,11 +12,12 @@ # Source file directory path_data = "U:/data/" -fn_train = "train.csv" #"filewos_bib_random_nastates.csv" +fn_train = "train3.csv" #"filewos_bib_random_nastates.csv" trainingdata = pd.read_csv(path_data + fn_train, usecols=['CITY', 'INSTITUTION_FULL','ADDRESS_FULL','d_state']) #,'OBSERVATION' -# print(trainingdata.head(20)) +trainingdata = trainingdata[0:1500] +print(trainingdata.head(10)) df = trainingdata.fillna('noInput') @@ -41,24 +42,27 @@ #test_files_names = dtrainingata['filename'][train_size:] # 32 states and NA -num_labels = 25 -vocab_size = 5000 +num_labels = 23 # there are only 25 states with institutions in the dataset +vocab_size = 20000 batch_size = 100 - from keras.preprocessing.text import Tokenizer from keras.models import Sequential from keras.layers import Activation, Dense, Dropout +from keras.constraints import maxnorm +from keras.utils import plot_model -print(train_input.head(10)) -print(train_tags[0:20]) -train_input.to_csv('inputprocessed.csv') + +#print(train_input.head(10)) +#print(train_tags[0:20]) +#train_input.to_csv('inputprocessed.csv') # define Tokenizer with Vocab Size tokenizer = Tokenizer(num_words=vocab_size) tokenizer.fit_on_texts(train_input) x_train = tokenizer.texts_to_matrix(train_input, mode='tfidf') +x_train.head(10) x_test = tokenizer.texts_to_matrix(test_input, mode='tfidf') #import sci-kit... 
@@ -72,32 +76,49 @@ y_test = encoder.transform(test_tags) +#exit() -print("and now the actual keras training:") +print("\n\n ---------- and now the actual keras training: ----------- \n\n") model = Sequential() -model.add(Dense(512, input_shape=(vocab_size,))) +model.add(Dense(int(512/2), input_shape=(vocab_size,), kernel_constraint=maxnorm(4))) model.add(Activation('relu')) -model.add(Dropout(0.3)) -model.add(Dense(512)) +model.add(Dropout(0.5)) +model.add(Dense(int(256/2), kernel_constraint=maxnorm(4))) model.add(Activation('relu')) -model.add(Dropout(0.3)) +model.add(Dropout(0.5)) model.add(Dense(num_labels)) model.add(Activation('softmax')) model.summary() -model.compile(loss='categorical_crossentropy', +model.compile(loss='categorical_crossentropy',#'mean_squared_error', #'categorical_crossentropy', optimizer='adam', metrics=['accuracy']) history = model.fit(x_train, y_train, batch_size=batch_size, - epochs=48, + epochs=15, verbose=1, - validation_split=0.1) - - - + validation_split=0.2) + + +# Plot training & validation accuracy values +plt.plot(history.history['acc']) +plt.plot(history.history['val_acc']) +plt.title('Model accuracy') +plt.ylabel('Accuracy') +plt.xlabel('Epoch') +plt.legend(['Train', 'Test'], loc='upper left') +plt.show() + +# Plot training & validation loss values +plt.plot(history.history['loss']) +plt.plot(history.history['val_loss']) +plt.title('Model loss') +plt.ylabel('Loss') +plt.xlabel('Epoch') +plt.legend(['Train', 'Test'], loc='upper left') +plt.show() score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1) @@ -106,12 +127,16 @@ text_labels = encoder.classes_ -for i in range(100): +plot_model(model, to_file='model.png') + +for i in range(50): prediction = model.predict(np.array([x_test[i]])) predicted_label = text_labels[np.argmax(prediction[0])] #print(test_files_names.iloc[i]) - print('Actual label:' + test_tags.iloc[i]) - print("Predicted label: " + predicted_label) + #print(f"\n {i} input-text: {train_input[i]} 
" )#+ train_input[i]) + #print(f"x_train {i}: {x_train[i]},\n y_train i: {y_train[i]}") + print('--Actual label:' + test_tags.iloc[i]) + print("Predicted label: " + predicted_label + "--") \ No newline at end of file