diff --git a/Preprocessing_quality_control/SVM_data_filtering.R b/Preprocessing_quality_control/SVM_data_filtering.R new file mode 100644 index 0000000..a6a71e7 --- /dev/null +++ b/Preprocessing_quality_control/SVM_data_filtering.R @@ -0,0 +1,74 @@ + +library(e1017) +library(Seurat) + +load("161105_turtle_all_.Robj") # R object with all sequenced cells (unfiltered) + +data.info<-turtle@data.info[,c(1:2, 8:12)] # create a data frame containing metadata +colnames(data.info) +# [1] "nGene" "nUMI" "percent.mito" "pCODING" "pUTR" +# [6] "pINTRONIC" "pINTERGENIC" "nReads" + +names <- rownames(turtle@data.info) +bad.names<-read.table("161105_bad_names.txt") # names of the cells in the bad quality cluster (cluster 6) +good.names<-names[!names %in% bad.names] + +# select cells and data to train the classifier + +train.bad.names<-sample(bad.names,100) # selects randomly 100 bad cells for training set +train.good.names<-sample(good.names,100) # selects randomly 100 good cells for training set + +train.bad<-data.info[train.bad.names,] +train.good<-data.info[train.good.names,] +train<-rbind(train.good,train.bad) +ytrain <- matrix(c(rep(1,100),rep(-1,100))) + +train.final = data.frame(train, ytrain=as.factor(ytrain)) + +# train the classifier + +svmfit=svm(ytrain~., data=train.final, kernel="linear", cost=0.1, scale=F) + +set.seed (1) +tune.out=tune(svm,ytrain~.,data=train.final,kernel="linear", ranges=list(cost=c(0.1,1,10,100, 500))) +summary(tune.out) +# this is choosing which is the best cost for the svm + +bestmod=tune.out$best.model +summary(bestmod) + +# build the test dataset + +test.bad.names<-bad.names[!bad.names %in% train.bad.names] +test.good.names<-good.names[!good.names %in% train.good.names] +test.bad<-data.info[test.bad.names,] +test.good<-data.info[test.good.names,] +test<-rbind(test.good,test.bad) +test<-data.info[!rownames(data.info)%in% rownames(train),] + +test<-as.matrix(test) +ytest<-matrix(c(rep(1,length(test.good.names)),rep(-1,length(test.bad.names)))) + +test.final = data.frame(test, ytest=as.factor(ytest)) + +ypred=predict(bestmod,test.final) +table(predict=ypred, truth=test.final$ytest) + +ypred<-as.data.frame(ypred) + +# add SVM classification to the seurat object +turtle = AddMetaData(turtle, ypred) + +write.table(ypred, "161106_ypred_final.txt") + +pred<-as.data.frame(turtle@data.info$ypred) +rownames(pred)<-rownames(turtle@data.info) +pred[train.bad.names,]<-"-1" +pred[train.good.names,]<-"1" +colnames(pred)<-c("svm_class") +turtle = AddMetaData(turtle, pred) +# now the turtle_all object has a svm_class column in the data.info slot, where the +1 and -1 classes are assigned also to the cells used for training. + +pdf("SVM_classification.pdf") +DimPlot(turtle, reduction.use="tsne", group.by="svm_class", pt.size=0.7) +dev.off() \ No newline at end of file