Skip to content
Permalink
c01dc3cc52
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
113 lines (102 sloc) 3.92 KB
'''
Created on Jul 1, 2019
@author: cxchu
'''
# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pickle
import numpy as np
import sys
def create_dataset(corpus_path, label2id, word2id, feature2id):
    """Encode a tab-separated corpus file as flat word ids plus an index table.

    A usable line holds exactly five tab-separated fields::

        start <TAB> end <TAB> words <TAB> labels <TAB> features

    where ``words`` and ``labels`` are whitespace-separated tokens and
    ``start``/``end`` are integers. Lines with any other field count are
    skipped.

    Args:
        corpus_path: Path of the corpus file to read.
        label2id: Mapping from label to column index in the label vector.
            Labels missing from the mapping are ignored. The label vector has
            ``len(label2id)`` columns (assumes ids are 0..len-1 -- confirm
            against the code that builds the dictionary).
        word2id: Mapping from word to integer id. Words missing from the
            mapping are encoded as ``word2id["unk"]``.
        feature2id: Not used; the features column is parsed but not stored.
            Kept so existing callers keep working.

    Returns:
        A ``(storage, data)`` tuple. ``storage`` is a 1-D int32 array with the
        word ids of all accepted lines concatenated. ``data`` is a 2-D int32
        array with one row per accepted line: columns 0-3 hold ``s_start``,
        ``s_end``, ``e_start`` and ``e_end`` as offsets into ``storage``; the
        remaining ``len(label2id)`` columns are 1 for every known label on
        the line.

    Raises:
        ValueError: If ``start`` or ``end`` is not an integer.
        KeyError: If a word is unknown and ``word2id`` has no ``"unk"`` entry.
    """
    num_of_labels = len(label2id)

    # First pass: count lines so the table can be allocated up front. The
    # context manager closes the handle instead of leaking it.
    with open(corpus_path) as f:
        num_of_samples = sum(1 for _ in f)

    storage = []
    data = np.zeros((num_of_samples, 4 + num_of_labels), "int32")
    s_start_pointer = 0
    num = 0
    with open(corpus_path) as f:
        for line in f:
            # Split exactly once, removing only the line terminator, so the
            # field-count check and the unpacking always agree (a trailing
            # empty field must not be eaten by a blanket strip()).
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) != 5:
                continue
            start, end, words, labels, _features = fields
            words = words.split()
            length = len(words)
            start, end = int(start), int(end)

            labels_code = [0] * num_of_labels
            for label in labels.split():
                if label in label2id:
                    labels_code[label2id[label]] = 1

            # NOTE(review): create_raw_dataset in this file falls back to
            # "unknown" rather than "unk" -- confirm which key the
            # dictionaries really contain.
            words_code = [word2id[word] if word in word2id else word2id["unk"]
                          for word in words]
            storage += words_code

            data[num, 0] = s_start_pointer           # s_start
            data[num, 1] = s_start_pointer + length  # s_end
            data[num, 2] = s_start_pointer + start   # e_start
            data[num, 3] = s_start_pointer + end     # e_end
            data[num, 4:] = labels_code

            s_start_pointer += length
            num += 1
            if num % 100000 == 0:
                print(num)  # progress indicator

    # Skipped lines leave unused all-zero rows at the end of the table; drop
    # them so every returned row describes a real sample.
    return np.array(storage, "int32"), data[:num]
def create_raw_dataset(label2id, word2id, feature2id):
    """Read unlabeled samples from standard input and encode them.

    Prints ``input``, then reads lines from ``sys.stdin`` until a line that
    equals ``end`` (after stripping whitespace) or until end-of-file, then
    prints ``get all input``. A usable line holds exactly three tab-separated
    fields::

        start <TAB> end <TAB> words

    where ``words`` is a whitespace-separated sentence and ``start``/``end``
    are integer token positions. Lines with any other field count are
    skipped.

    Args:
        label2id: Mapping whose size fixes the number of label columns in the
            returned table; the columns themselves are left at zero.
        word2id: Mapping from word to integer id. Words missing from the
            mapping are encoded as ``word2id["unknown"]``.
        feature2id: Not used. Kept so existing callers keep working.

    Returns:
        A ``(storage, data, sentences, mentions)`` tuple with one entry per
        accepted line. ``storage`` is a 1-D int32 array of all word ids
        concatenated. ``data`` is a 2-D int32 array whose columns 0-3 hold
        ``s_start``, ``s_end``, ``e_start`` and ``e_end`` as offsets into
        ``storage``, followed by ``len(label2id)`` zero columns.
        ``sentences`` holds the raw ``words`` field of each line and
        ``mentions`` the tokens ``words[start:end]`` joined by spaces (or the
        single token ``words[start]`` when ``start == end``).

    Raises:
        ValueError: If ``start`` or ``end`` is not an integer.
        IndexError: If the mention positions fall outside the sentence.
        KeyError: If a word is unknown and ``word2id`` has no ``"unknown"``
            entry.
    """
    num_of_labels = len(label2id)

    print('input')
    sys.stdout.flush()

    # readline() returns '' only at end-of-file; stopping there prevents an
    # endless loop when the 'end' sentinel never arrives. Every line --
    # including the very first -- is stripped before the sentinel check.
    lines = []
    for raw in iter(sys.stdin.readline, ''):
        line = raw.strip()
        if line == 'end':
            break
        lines.append(line)

    print('get all input')
    sys.stdout.flush()

    storage = []
    sentences = []
    mentions = []
    data = np.zeros((len(lines), 4 + num_of_labels), "int32")
    s_start_pointer = 0
    num = 0
    for line in lines:
        fields = line.split("\t")
        if len(fields) != 3:
            continue
        start, end, words = fields
        sentences.append(words)
        words = words.split()
        length = len(words)
        start, end = int(start), int(end)

        if start == end:
            mention = words[start]
        else:
            # Explicit indexing (not slicing) so out-of-range positions still
            # raise IndexError instead of being silently truncated.
            mention = " ".join([words[i] for i in range(start, end)])
        mentions.append(mention)

        # NOTE(review): create_dataset in this file falls back to "unk"
        # rather than "unknown" -- confirm which key the dictionaries
        # really contain.
        words_code = [word2id[word] if word in word2id else word2id["unknown"]
                      for word in words]
        storage += words_code

        data[num, 0] = s_start_pointer           # s_start
        data[num, 1] = s_start_pointer + length  # s_end
        data[num, 2] = s_start_pointer + start   # e_start
        data[num, 3] = s_start_pointer + end     # e_end
        # Columns 4.. are the label vector; raw input carries no labels, so
        # they keep the zeros they were allocated with.

        s_start_pointer += length
        num += 1
        if num % 100000 == 0:
            print(num)  # progress indicator

    # Drop the unused rows left behind by skipped lines so that data,
    # sentences and mentions all have the same length.
    return np.array(storage, "int32"), data[:num], sentences, mentions
def main():
    """Command-line entry point: encode a corpus and dump it with joblib.

    Expects three command-line arguments:

    1. path of a joblib file holding a dict with the keys ``"label2id"``,
       ``"word2id"`` and ``"feature2id"``;
    2. path of the corpus file passed to ``create_dataset``;
    3. path of the output file, which receives a dict with the keys
       ``"storage"`` and ``"data"``.

    Exits with a usage message when fewer than three arguments are given.

    NOTE(review): ``joblib`` comes from the module-level
    ``sklearn.externals`` import, which recent scikit-learn releases no
    longer ship -- confirm the pinned version or switch to the standalone
    ``joblib`` package.
    """
    if len(sys.argv) < 4:
        # Fail with an actionable message instead of a bare IndexError.
        sys.exit("usage: %s <dicts_file> <corpus_file> <output_file>" % sys.argv[0])
    dicts_path, corpus_path, output_path = sys.argv[1:4]

    dicts = joblib.load(dicts_path)
    label2id = dicts["label2id"]
    word2id = dicts["word2id"]
    feature2id = dicts["feature2id"]

    storage, data = create_dataset(corpus_path, label2id, word2id, feature2id)
    dataset = {"storage": storage, "data": data}
    joblib.dump(dataset, output_path)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()