"""Train a Keras feed-forward text classifier mapping address strings to states.

Reconstructed from a git patch (commit b35e3e9, "model runs first time",
file ``fromNotebook.py``). The script:

1. reads ``train.csv`` (columns CITY, INSTITUTION_FULL, ADDRESS_FULL, d_state),
2. merges the three address columns into a single text feature,
3. tf-idf vectorises the text with a Keras ``Tokenizer`` (top 5000 tokens),
4. one-hot encodes the ``d_state`` labels with a ``LabelBinarizer``,
5. fits a small dense network (512-512-softmax) for 30 epochs.

NOTE(review): the 80/20 split below is positional, not shuffled — if the CSV
is ordered (e.g. by state) the test set is not representative; confirm the
source file is pre-shuffled or shuffle before splitting.
"""

import numpy as np
import pandas as pd
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

print("-------------start-------------")

# For reproducibility.
np.random.seed(1237)

# Source file directory and training file name.
path_data = "U:/data/"
fn_train = "train.csv"  # "filewos_bib_random_nastates.csv"

trainingdata = pd.read_csv(
    path_data + fn_train,
    usecols=['CITY', 'INSTITUTION_FULL', 'ADDRESS_FULL', 'd_state'],
)

# Replace missing values with a sentinel token so string concatenation
# below never produces NaN rows.
df = trainingdata.fillna('noInput')

# Merge the input columns into one big string column 'merged'.
df["merged"] = df["CITY"].map(str) + ' ' + df["INSTITUTION_FULL"] + ' ' + df["ADDRESS_FULL"]

# Take 80% of the data for training, the remaining 20% for test.
train_size = int(len(df) * .8)

train_input = df['merged'][:train_size]
train_tags = df['d_state'][:train_size]

test_input = df['merged'][train_size:]
test_tags = df['d_state'][train_size:]

vocab_size = 5000
batch_size = 100

print(train_input.head(10))
print(train_tags[0:20])
train_input.to_csv('inputprocessed.csv')

# Tf-idf document-term matrices over the `vocab_size` most frequent tokens.
# The tokenizer is fitted on the training split only, so the test matrix
# shares the same vocabulary.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_input)

x_train = tokenizer.texts_to_matrix(train_input, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_input, mode='tfidf')

# One-hot encode the state labels.
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

# BUG FIX: the original hard-coded `num_labels = 25` while its own comment
# said "32 states and NA" (= 33 classes). If the true class count differs
# from the constant, the final Dense layer width no longer matches
# y_train.shape[1] and model.fit fails. Derive it from the fitted encoder
# so the output layer always matches the one-hot label width.
num_labels = len(encoder.classes_)

print("and now the actual keras training:")

model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Hold out 10% of the training split for per-epoch validation.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=30,
                    verbose=1,
                    validation_split=0.1)