SVM_data_filtering.R


library(e1017)
library(Seurat)

load("161105_turtle_all_.Robj") # R object with all sequenced cells (unfiltered)

data.info<-turtle@data.info[,c(1:2, 8:12)] # create a data frame containing metadata
colnames(data.info)
# [1] "nGene"        "nUMI"         "percent.mito" "pCODING"      "pUTR"
# [6] "pINTRONIC"    "pINTERGENIC"  "nReads"

names <- rownames(turtle@data.info)
bad.names<-read.table("161105_bad_names.txt") # names of the cells in the bad quality cluster (cluster 6)
good.names<-names[!names %in% bad.names]

# select cells and data to train the classifier

train.bad.names<-sample(bad.names,100) # selects randomly 100 bad cells for training set
train.good.names<-sample(good.names,100) # selects randomly 100 good cells for training set

train.bad<-data.info[train.bad.names,]
train.good<-data.info[train.good.names,]
train<-rbind(train.good,train.bad)
ytrain <- matrix(c(rep(1,100),rep(-1,100)))

train.final = data.frame(train, ytrain=as.factor(ytrain))

# train the classifier

svmfit=svm(ytrain~., data=train.final, kernel="linear", cost=0.1, scale=F)

set.seed (1)
tune.out=tune(svm,ytrain~.,data=train.final,kernel="linear", ranges=list(cost=c(0.1,1,10,100, 500)))
summary(tune.out)
# this is choosing which is the best cost for the svm

bestmod=tune.out$best.model
summary(bestmod)

# build the test dataset

test.bad.names<-bad.names[!bad.names %in% train.bad.names]
test.good.names<-good.names[!good.names %in% train.good.names]
test.bad<-data.info[test.bad.names,]
test.good<-data.info[test.good.names,]
test<-rbind(test.good,test.bad)
test<-data.info[!rownames(data.info)%in% rownames(train),]

test<-as.matrix(test)
ytest<-matrix(c(rep(1,length(test.good.names)),rep(-1,length(test.bad.names))))

test.final = data.frame(test, ytest=as.factor(ytest))

ypred=predict(bestmod,test.final)
table(predict=ypred, truth=test.final$ytest)

ypred<-as.data.frame(ypred)

# add SVM classification to the seurat object
turtle = AddMetaData(turtle, ypred)

write.table(ypred, "161106_ypred_final.txt")

pred<-as.data.frame(turtle@data.info$ypred)
rownames(pred)<-rownames(turtle@data.info)
pred[train.bad.names,]<-"-1"
pred[train.good.names,]<-"1"
colnames(pred)<-c("svm_class")
turtle = AddMetaData(turtle, pred)
# now the turtle_all object has a svm_class column in the data.info slot, where the +1 and -1 classes are assigned also to the cells used for training.

pdf("SVM_classification.pdf")
DimPlot(turtle, reduction.use="tsne", group.by="svm_class", pt.size=0.7)
dev.off()

	library(e1017)
	library(Seurat)

	load("161105_turtle_all_.Robj") # R object with all sequenced cells (unfiltered)

	data.info<-turtle@data.info[,c(1:2, 8:12)] # create a data frame containing metadata
	colnames(data.info)
	# [1] "nGene" "nUMI" "percent.mito" "pCODING" "pUTR"
	# [6] "pINTRONIC" "pINTERGENIC" "nReads"

	names <- rownames(turtle@data.info)
	bad.names<-read.table("161105_bad_names.txt") # names of the cells in the bad quality cluster (cluster 6)
	good.names<-names[!names %in% bad.names]

	# select cells and data to train the classifier

	train.bad.names<-sample(bad.names,100) # selects randomly 100 bad cells for training set
	train.good.names<-sample(good.names,100) # selects randomly 100 good cells for training set

	train.bad<-data.info[train.bad.names,]
	train.good<-data.info[train.good.names,]
	train<-rbind(train.good,train.bad)
	ytrain <- matrix(c(rep(1,100),rep(-1,100)))

	train.final = data.frame(train, ytrain=as.factor(ytrain))

	# train the classifier

	svmfit=svm(ytrain~., data=train.final, kernel="linear", cost=0.1, scale=F)

	set.seed (1)
	tune.out=tune(svm,ytrain~.,data=train.final,kernel="linear", ranges=list(cost=c(0.1,1,10,100, 500)))
	summary(tune.out)
	# this is choosing which is the best cost for the svm

	bestmod=tune.out$best.model
	summary(bestmod)

	# build the test dataset

	test.bad.names<-bad.names[!bad.names %in% train.bad.names]
	test.good.names<-good.names[!good.names %in% train.good.names]
	test.bad<-data.info[test.bad.names,]
	test.good<-data.info[test.good.names,]
	test<-rbind(test.good,test.bad)
	test<-data.info[!rownames(data.info)%in% rownames(train),]

	test<-as.matrix(test)
	ytest<-matrix(c(rep(1,length(test.good.names)),rep(-1,length(test.bad.names))))

	test.final = data.frame(test, ytest=as.factor(ytest))

	ypred=predict(bestmod,test.final)
	table(predict=ypred, truth=test.final$ytest)

	ypred<-as.data.frame(ypred)

	# add SVM classification to the seurat object
	turtle = AddMetaData(turtle, ypred)

	write.table(ypred, "161106_ypred_final.txt")

	pred<-as.data.frame(turtle@data.info$ypred)
	rownames(pred)<-rownames(turtle@data.info)
	pred[train.bad.names,]<-"-1"
	pred[train.good.names,]<-"1"
	colnames(pred)<-c("svm_class")
	turtle = AddMetaData(turtle, pred)
	# now the turtle_all object has a svm_class column in the data.info slot, where the +1 and -1 classes are assigned also to the cells used for training.

	pdf("SVM_classification.pdf")
	DimPlot(turtle, reduction.use="tsne", group.by="svm_class", pt.size=0.7)
	dev.off()