update file

MPIBR · May 1, 2018 · 50f1afc · 50f1afc
1 parent 652f6c8
commit 50f1afc
Showing 1 changed file with 74 additions and 0 deletions.
diff --git a/Preprocessing_quality_control/SVM_data_filtering.R b/Preprocessing_quality_control/SVM_data_filtering.R
@@ -0,0 +1,74 @@
+
+library(e1017)
+library(Seurat)
+
+load("161105_turtle_all_.Robj") # R object with all sequenced cells (unfiltered)
+
+data.info<-turtle@data.info[,c(1:2, 8:12)] # create a data frame containing metadata
+colnames(data.info)
+# [1] "nGene"        "nUMI"         "percent.mito" "pCODING"      "pUTR"
+# [6] "pINTRONIC"    "pINTERGENIC"  "nReads"
+
+names <- rownames(turtle@data.info) 
+bad.names<-read.table("161105_bad_names.txt") # names of the cells in the bad quality cluster (cluster 6) 
+good.names<-names[!names %in% bad.names]
+
+# select cells and data to train the classifier
+
+train.bad.names<-sample(bad.names,100) # selects randomly 100 bad cells for training set
+train.good.names<-sample(good.names,100) # selects randomly 100 good cells for training set
+
+train.bad<-data.info[train.bad.names,]
+train.good<-data.info[train.good.names,]
+train<-rbind(train.good,train.bad)
+ytrain <- matrix(c(rep(1,100),rep(-1,100)))
+
+train.final = data.frame(train, ytrain=as.factor(ytrain))
+
+# train the classifier
+
+svmfit=svm(ytrain~., data=train.final, kernel="linear", cost=0.1, scale=F) 
+
+set.seed (1)
+tune.out=tune(svm,ytrain~.,data=train.final,kernel="linear", ranges=list(cost=c(0.1,1,10,100, 500)))
+summary(tune.out)
+# this is choosing which is the best cost for the svm
+
+bestmod=tune.out$best.model
+summary(bestmod)
+
+# build the test dataset 
+
+test.bad.names<-bad.names[!bad.names %in% train.bad.names]
+test.good.names<-good.names[!good.names %in% train.good.names]
+test.bad<-data.info[test.bad.names,]
+test.good<-data.info[test.good.names,]
+test<-rbind(test.good,test.bad)
+test<-data.info[!rownames(data.info)%in% rownames(train),]
+
+test<-as.matrix(test)
+ytest<-matrix(c(rep(1,length(test.good.names)),rep(-1,length(test.bad.names))))
+
+test.final = data.frame(test, ytest=as.factor(ytest))
+
+ypred=predict(bestmod,test.final)
+table(predict=ypred, truth=test.final$ytest)
+
+ypred<-as.data.frame(ypred)
+
+# add SVM classification to the seurat object
+turtle = AddMetaData(turtle, ypred)
+
+write.table(ypred, "161106_ypred_final.txt")
+
+pred<-as.data.frame(turtle@data.info$ypred)
+rownames(pred)<-rownames(turtle@data.info)
+pred[train.bad.names,]<-"-1"
+pred[train.good.names,]<-"1"
+colnames(pred)<-c("svm_class")
+turtle = AddMetaData(turtle, pred)
+# now the turtle_all object has a svm_class column in the data.info slot, where the +1 and -1 classes are assigned also to the cells used for training. 
+
+pdf("SVM_classification.pdf")
+DimPlot(turtle, reduction.use="tsne", group.by="svm_class", pt.size=0.7)
+dev.off()