Skip to content

Commit

Permalink
model runs first time
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom Theile committed Aug 23, 2019
1 parent d2df9ce commit b35e3e9
Showing 1 changed file with 97 additions and 0 deletions.
97 changes: 97 additions & 0 deletions fromNotebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@

import pandas as pd
import numpy as np
import pickle
from pathlib import Path



print("-------------start-------------")
# For reproducibility of the model initialisation / training below.
np.random.seed(1237)

# Source data directory and training file.
path_data = Path("U:/data")
fn_train = "train.csv"  # "filewos_bib_random_nastates.csv"

# Only the address-related input columns plus the target label are needed.
trainingdata = pd.read_csv(path_data / fn_train,
                           usecols=['CITY', 'INSTITUTION_FULL', 'ADDRESS_FULL', 'd_state'])
# print(trainingdata.head(20))

# Missing cells would break the string concatenation below, so replace
# them with a harmless placeholder token first.
df = trainingdata.fillna('noInput')

# Merge the input columns into one big string column 'merged'.
# astype(str) on every column guards against non-string dtypes; the
# original only cast CITY, which would crash if another column were numeric.
df["merged"] = (df["CITY"].astype(str) + ' '
                + df["INSTITUTION_FULL"].astype(str) + ' '
                + df["ADDRESS_FULL"].astype(str))



# Prepare the data for Keras: first 80% of rows are the training split,
# the remaining 20% are held out for testing.
train_size = int(len(df) * 0.8)

train_input = df['merged'].iloc[:train_size]
train_tags = df['d_state'].iloc[:train_size]

test_input = df['merged'].iloc[train_size:]
test_tags = df['d_state'].iloc[train_size:]

# NOTE(review): the original comment said "32 states and NA" but
# num_labels is 25 — confirm against the actual label set in d_state.
num_labels = 25
vocab_size = 5000
batch_size = 100


from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout

# Quick sanity check of the inputs before vectorising.
print(train_input.head(10))
print(train_tags[0:20])
train_input.to_csv('inputprocessed.csv')

# Build the vocabulary on the training texts only, capped at vocab_size
# words, then turn both splits into tf-idf weighted bag-of-words matrices.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_input)

x_train, x_test = (tokenizer.texts_to_matrix(texts, mode='tfidf')
                   for texts in (train_input, test_input))

#import sci-kit...
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds


encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)



print("and now the actual keras training:")

# Two 512-unit ReLU hidden layers with dropout, softmax over the labels.
model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
# Size the output layer from the encoded labels rather than the
# hard-coded num_labels: this guarantees it matches y_train even if the
# number of distinct states in the data changes (the earlier comment
# says "32 states and NA" while num_labels is 25 — a mismatch here
# would make model.fit fail).
model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Hold out 10% of the training split for validation during fitting.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=30,
                    verbose=1,
                    validation_split=0.1)

0 comments on commit b35e3e9

Please sign in to comment.