theile/neuralMexicanStateClassifier · fromNotebook.py (155 lines, 115 loc, 4.48 KB)
# fromNotebook.py
# Trains a simple Keras bag-of-words classifier that predicts the Mexican state
# ('d_state') from the city, institution, and address strings of a record.

import pandas as pd
import numpy as np
import pickle             # (unused notebook leftover)
from pathlib import Path  # (unused notebook leftover)
import matplotlib.pyplot as plt

print("\n -------------start-------------")

# For reproducibility
np.random.seed(1237)

# Source file directory
path_data = "U:/data/"
fn_train = "train4-womexico.csv"  # "train3.csv"  # "filewos_bib_random_nastates.csv"

trainingdata = pd.read_csv(path_data + fn_train,
                           usecols=['CITY', 'INSTITUTION_FULL', 'ADDRESS_FULL', 'd_state'])  # ,'OBSERVATION'
trainingdata = trainingdata[0:2200]
# trainingdata.head(10)
df = trainingdata.fillna('noInput')

# merge the input columns into one big string column 'merged'
df["merged"] = df["CITY"].map(str) + ' ' + df["INSTITUTION_FULL"] + ' ' + df["ADDRESS_FULL"]
print(df.head(9))
df.to_csv('merged.csv')
# shuffle the rows, because I suspect some row-dependence... undo for debugging
df = df.reindex(np.random.permutation(df.index))

# prepare data for keras:
# let's take 80% of the data for training and the remaining 20% for testing.
train_size = int(len(df) * .8)
train_input = df['merged'][:train_size]
train_tags = df['d_state'][:train_size]
# train_files_names = df['filename'][:train_size]
test_input = df['merged'][train_size:]
test_tags = df['d_state'][train_size:]
# test_files_names = trainingdata['filename'][train_size:]

# Mexico has 32 states, but only 25 states with institutions (plus NA) occur in this dataset
num_labels = 26
vocab_size = 16000  # how many different words to keep?
batch_size = 150

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPool1D
from keras.constraints import maxnorm
from keras.utils import plot_model, to_categorical

# print(train_input.head(10))
# print(train_tags[0:20])
# train_input.to_csv('inputprocessed.csv')

# define Tokenizer with vocab size and build tf-idf features from the merged strings
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_input)
x_train = tokenizer.texts_to_matrix(train_input, mode='tfidf')
print(x_train[0])
print(f"x_train len: {len(x_train[0])}")
x_test = tokenizer.texts_to_matrix(test_input, mode='tfidf')

# import scikit-learn for one-hot encoding of the state labels
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds  # (unused notebook leftover)

encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
# print("train tags_ ", train_tags[0:20])
# y_train = to_categorical(train_tags, num_labels)
# y_test = to_categorical(test_tags, num_labels)
text_labels = encoder.classes_
print("y_train: ", y_train[0:10])
for ls in range(10):
    print("y_train argmax", (np.argmax(y_train[ls])))
print("y_test", y_test[0:10])
print("labels: ", text_labels)
# exit()

print("\n\n ---------- and now the actual keras training: ----------- \n\n")

model = Sequential()
model.add(Dense(100, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# model.add(Dense(200))
# model.add(Activation('relu'))
# model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()
model.compile(loss='mean_squared_error',  # 'categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.2)

# Plot training & validation accuracy values
# (newer Keras versions use the history keys 'accuracy'/'val_accuracy' instead of 'acc'/'val_acc')
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

text_labels = encoder.classes_
plot_model(model, to_file='model.png')

# print predicted vs. true state for a sample of test rows
for i in range(100, 160):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    # print(test_files_names.iloc[i])
    # print(f"\n {i} input-text: {train_input[i]} ")  # + train_input[i])
    # print(f"x_train {i}: {x_train[i]},\n y_train i: {y_train[i]}")
    print('Label:' + test_tags.iloc[i].ljust(19) + " -Predicted label: " + predicted_label.ljust(19)
          + f" {np.max(prediction[0]):4.2f}")
    # print(prediction)
print(text_labels)
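
# --- Illustrative addition (not part of the original script) ---
# A minimal sketch of how the trained model could be used to classify a single new
# address string. It assumes the tokenizer, encoder, and model defined above are
# still in scope; the example address below is hypothetical.
new_address = "Guadalajara Universidad de Guadalajara Av Juarez 976 Jalisco"
x_new = tokenizer.texts_to_matrix([new_address], mode='tfidf')
pred = model.predict(x_new)
print("Predicted state:", encoder.classes_[np.argmax(pred[0])],
      f"(confidence {np.max(pred[0]):4.2f})")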