Merge pull request loosolab#25 from loosolab/mqparser_tests

Added tests for parse_MaxQuant
HendrikSchultheis · Jul 16, 2018 · 1b98804 · 1b98804
2 parents 09b40ca + 4548826
commit 1b98804
Show file tree

Hide file tree

Showing 9 changed files with 675 additions and 6 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -49,5 +49,6 @@ Imports: shiny,
 RoxygenNote: 6.0.1
 biocViews: 
 Suggests: knitr,
-    rmarkdown
+    rmarkdown,
+    testthat
 VignetteBuilder: knitr
diff --git a/R/parser.R b/R/parser.R
@@ -25,6 +25,8 @@
 #' @param version pre-header information about version (optional)
 #' @param experiment_id pre-header information about experiment id (optional)
 #'
+#' @return TRUE on success
+#'
 #' @export
 parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduced, config = system.file("extdata", "parser_MaxQuant_config.json", package = "wilson"), delimiter = ";", format = NULL, version = NULL, experiment_id = NULL){
   if (missing(proteinGroups_in)) {
@@ -85,7 +87,7 @@ parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduce
   # @return String level of given column
   get_sample_level <- function(col_head, isSample, full_list) {
     # Get the level of all 'sample' columns.
-    # Default: level <- "sample"
+    # Default: level is "sample"
     if (grepl("Ratio", col_head, perl = TRUE)) {
       if (grepl("type", col_head, perl = TRUE)) return("feature")
       return("contrast")
@@ -165,7 +167,7 @@ parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduce
   # @param version version number
   # @param exp_id experiment id
   # @param pGroups data table protein groups file
-  write_clarion_file <- function(meta, out, format, version, exp_id, pGroups, delimiter){
+  write_clarion_file <- function(meta, out, format, version, exp_id, pGroups, delimiter) {
     to_append <- FALSE
     if (!missing(format)) {
       write(paste0("!format=", format), file = out, append = to_append)
@@ -188,10 +190,21 @@ parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduce
   # reading files in data tables
   proteinGroups <- data.table::fread(proteinGroups_in, header = TRUE, quote = "")
   summary_file <- data.table::fread(summary_in, header = TRUE)
-  meta_config <- rjson::fromJSON(file = config)
+
+  meta_config <- tryCatch({
+    rjson::fromJSON(file = config)
+  }, error = function(cond) {
+    stop("Could not read config file")
+  }, warning = function(w) {
+    stop("Could not read config file")
+  })
 
   # getting experiment names
-  exp_names <- (unique(summary_file[Experiment != "", Experiment]))
+  if ("Experiment" %in% colnames(summary_file)) {
+    exp_names <- unique(summary_file[Experiment != "", Experiment])
+  } else {
+    stop("wrong format on summary file: column \'Experiment\' misssing")
+  }
 
   meta <- get_meta_from_config(meta_config = meta_config)
 
@@ -203,6 +216,9 @@ parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduce
   sample_ary <- meta_config$type_array
   reduced_list <- meta_config$reduced_list
   full_sample_list <- c(sample_scores, sample_ratios, sample_probability, sample_category, sample_ary)
+  if (is.null(reduced_list)) {
+    stop("reduced_list is missing in config file")
+  }
 
   # get column names
   col_names <- colnames(proteinGroups)
@@ -214,7 +230,7 @@ parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduce
   # append rows to data table with metadata
   samples_list <- lapply(col_names, function(col_head) {
 
-    unlist(lapply(exp_names, function(name){
+    unlist(lapply(exp_names, function(name) {
       name_brackets <- paste0("\\Q", name)
       exp_regex <- paste0("\\Q ", name)
       sample_description <- strsplit(col_head, exp_regex)
@@ -286,6 +302,8 @@ parse_MaxQuant <- function(proteinGroups_in, summary_in, outfile, outfile_reduce
   # writing reduced CLARION file
   write_clarion_file(meta = meta_reduced, out = outfile_reduced, format = format,
                      version = version, exp_id = experiment_id, pGroups = proteinGroups, delimiter = delimiter)
+
+  return(TRUE)
 }
 
 #' Method to parse input file.

diff --git a/tests/testthat.R b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(wilson)
+
+test_check("wilson")
diff --git a/tests/testthat/fail_config.json b/tests/testthat/fail_config.json
@@ -0,0 +1,291 @@
+{
+  "meta": [
+  {
+    "col_name": "Protein IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "proteins"
+  },
+  {
+    "col_name": "Majority protein IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "majority protein"
+  },
+  {
+    "col_name": "Protein names",
+    "level": "feature",
+    "type": "array",
+    "label": "protein names",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Gene names",
+    "level": "feature",
+    "type": "array",
+    "label": "gene names",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Fasta headers",
+    "level": "feature",
+    "type": "array",
+    "label": "fasta headers",
+    "sublabel": ""
+  },
+  {
+    "col_name": "id",
+    "level": "feature",
+    "type": "unique_id",
+    "label": "unique identifier",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Peptide IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "peptide"
+  },
+  {
+    "col_name": "Mod. peptide IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "mod. peptide"
+  },
+  {
+    "col_name": "Evidence IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "evidence"
+  },
+  {
+    "col_name": "MS/MS IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "MS/MS"
+  },
+  {
+    "col_name": "Mol. weight [kDa]",
+    "level": "feature",
+    "type": "category",
+    "label": "Mol. weight [kDa]",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Sequence length",
+    "level": "feature",
+    "type": "category",
+    "label": "length",
+    "sublabel": "Sequence"
+  },
+  {
+    "col_name": "Sequence lengths",
+    "level": "feature",
+    "type": "array",
+    "label": "lengths",
+    "sublabel": "Sequence"
+  },
+  {
+    "col_name": "Reverse",
+    "level": "feature",
+    "type": "category",
+    "label": "Reverse",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Potential contaminant",
+    "level": "feature",
+    "type": "category",
+    "label": "Potential contaminant",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Oxidation (M) site IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "Oxidation (M) site"
+  },
+  {
+    "col_name": "Oxidation (M) site positions",
+    "level": "feature",
+    "type": "array",
+    "label": "positions",
+    "sublabel": "Oxidation (M) site"
+  },
+  {
+    "col_name": "Phospho (STY) site IDs",
+    "level": "feature",
+    "type": "array",
+    "label": "IDs",
+    "sublabel": "Phospho (STY) site"
+  },
+  {
+    "col_name": "Phospho (STY) site positions",
+    "level": "feature",
+    "type": "array",
+    "label": "positions",
+    "sublabel": "Phospho (STY) site"
+  },
+  {
+    "col_name": "Peptide counts (all)",
+    "level": "condition",
+    "type": "array",
+    "label": "counts",
+    "sublabel": "all peptide"
+  },
+  {
+    "col_name": "Peptide counts (razor+unique)",
+    "level": "condition",
+    "type": "array",
+    "label": "counts",
+    "sublabel": "razor+unique peptides"
+  },
+  {
+    "col_name": "Peptide counts (unique)",
+    "level": "condition",
+    "type": "array",
+    "label": "counts",
+    "sublabel": "unique peptides"
+  },
+  {
+    "col_name": "Number of proteins",
+    "level": "condition",
+    "type": "score",
+    "label": "count",
+    "sublabel": "proteins"
+  },
+  {
+    "col_name": "Peptides",
+    "level": "condition",
+    "type": "score",
+    "label": "count",
+    "sublabel": "Peptides"
+  },
+  {
+    "col_name": "Razor + unique peptides",
+    "level": "condition",
+    "type": "score",
+    "label": "count",
+    "sublabel": "Razor + unique peptides"
+  },
+  {
+    "col_name": "Unique peptides",
+    "level": "condition",
+    "type": "score",
+    "label": "count",
+    "sublabel": "Unique peptides"
+  },
+  {
+    "col_name": "MS/MS count",
+    "level": "condition",
+    "type": "score",
+    "label": "count",
+    "sublabel": "MS/MS"
+  },
+  {
+    "col_name": "Fraction average",
+    "level": "condition",
+    "type": "score",
+    "label": "fraction",
+    "sublabel": "average"
+  },
+  {
+    "col_name": "Best MS/MS",
+    "level": "condition",
+    "type": "array",
+    "label": "MS/MS",
+    "sublabel": "best"
+  },
+  {
+    "col_name": "Intensity",
+    "level": "condition",
+    "type": "score",
+    "label": "Intensity",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Intensity L",
+    "level": "condition",
+    "type": "score",
+    "label": "Intensity",
+    "sublabel": "L"
+  },
+  {
+    "col_name": "Intensity M",
+    "level": "condition",
+    "type": "score",
+    "label": "Intensity",
+    "sublabel": "M"
+  },
+  {
+    "col_name": "Intensity H",
+    "level": "condition",
+    "type": "score",
+    "label": "Intensity",
+    "sublabel": "H"
+  },
+  {
+    "col_name": "Q-value",
+    "level": "condition",
+    "type": "probability",
+    "label": "q-value",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Score",
+    "level": "condition",
+    "type": "probability",
+    "label": "score",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Unique sequence coverage [%]",
+    "level": "condition",
+    "type": "ratio",
+    "label": "sequence coverage",
+    "sublabel": "Unique"
+  },
+  {
+    "col_name": "Unique + razor sequence coverage [%]",
+    "level": "condition",
+    "type": "ratio",
+    "label": "sequence coverage",
+    "sublabel": "Unique + razor"
+  },
+  {
+    "col_name": "Sequence coverage [%]",
+    "level": "condition",
+    "type": "ratio",
+    "label": "sequence coverage",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Only identified by site",
+    "level": "feature",
+    "type": "category",
+    "label": "identified by site",
+    "sublabel": ""
+  },
+  {
+    "col_name": "Peptide is razor",
+    "level": "condition",
+    "type": "array",
+    "label": "peptide",
+    "sublabel": "is razor"
+  }
+  ],
+
+  "type_scores":
+  ["Peptides", "Razor + unique peptides", "Unique peptides", "Ratio M/L count", "Ratio M/L iso-count", "Ratio H/L count", "Ratio H/L iso-count" ,
+   "Ratio H/M count", "Ratio H/M iso-count", "Intensity", "Intensity L", "Intensity M", "Intensity H", "MS/MS count", "LFQ intensity", "Reporter intensity count",
+    "Reporter intensity corrected", "Reporter intensity", "Fraction", "iBAQ"]
+
+}
+
diff --git a/tests/testthat/proteinGroups_test.txt b/tests/testthat/proteinGroups_test.txt
@@ -0,0 +1,2 @@
+Protein IDs	Majority protein IDs	Peptide counts (all)	Peptide counts (razor+unique)	Peptide counts (unique)	Protein names	Gene names	Fasta headers	Number of proteins	Peptides	Razor + unique peptides	Unique peptides	Peptides Exp1	Peptides Exp2	Razor + unique peptides Exp1	Razor + unique peptides Exp2	Unique peptides Exp1	Unique peptides Exp2	Sequence coverage [%]	Unique + razor sequence coverage [%]	Unique sequence coverage [%]	Mol. weight [kDa]	Sequence length	Sequence lengths	Fraction average	Fraction 1	Fraction 2	Fraction 3	Fraction 4	Fraction 5	Fraction 6	Fraction 7	Fraction 8	Q-value	Score	Reporter intensity corrected 0	Reporter intensity corrected 1	Reporter intensity corrected 2	Reporter intensity corrected 3	Reporter intensity corrected 4	Reporter intensity corrected 5	Reporter intensity 0	Reporter intensity 1	Reporter intensity 2	Reporter intensity 3	Reporter intensity 4	Reporter intensity 5	Reporter intensity count 0	Reporter intensity count 1	Reporter intensity count 2	Reporter intensity count 3	Reporter intensity count 4	Reporter intensity count 5	Reporter intensity corrected 0 Exp1	Reporter intensity corrected 1 Exp1	Reporter intensity corrected 2 Exp1	Reporter intensity corrected 3 Exp1	Reporter intensity corrected 4 Exp1	Reporter intensity corrected 5 Exp1	Reporter intensity corrected 0 Exp2	Reporter intensity corrected 1 Exp2	Reporter intensity corrected 2 Exp2	Reporter intensity corrected 3 Exp2	Reporter intensity corrected 4 Exp2	Reporter intensity corrected 5 Exp2	Reporter intensity 0 Exp1	Reporter intensity 1 Exp1	Reporter intensity 2 Exp1	Reporter intensity 3 Exp1	Reporter intensity 4 Exp1	Reporter intensity 5 Exp1	Reporter intensity 0 Exp2	Reporter intensity 1 Exp2	Reporter intensity 2 Exp2	Reporter intensity 3 Exp2	Reporter intensity 4 Exp2	Reporter intensity 5 Exp2	Reporter intensity count 0 Exp1	Reporter intensity count 1 Exp1	Reporter intensity count 2 Exp1	Reporter intensity count 3 Exp1	Reporter intensity count 4 Exp1	Reporter intensity count 5 Exp1	Reporter intensity count 0 Exp2	Reporter intensity count 1 Exp2	Reporter intensity count 2 Exp2	Reporter intensity count 3 Exp2	Reporter intensity count 4 Exp2	Reporter intensity count 5 Exp2	Sequence coverage Exp1 [%]	Sequence coverage Exp2 [%]	Intensity	Intensity Exp1	Intensity Exp2	MS/MS count	Only identified by site	Reverse	Potential contaminant	id	Peptide IDs	Peptide is razor	Mod. peptide IDs	Evidence IDs	MS/MS IDs	Best MS/MS	Oxidation (M) site IDs	Oxidation (M) site positions
+A0A068BEQ2;P50171;P50171-2;G3UX44	A0A068BEQ2;P50171;P50171-2;G3UX44	11;11;11;10	11;11;11;10	11;11;11;10	Estradiol 17-beta-dehydrogenase 8	H2-Ke6;Hsd17b8	tr|A0A068BEQ2|A0A068BEQ2_MOUSE H2-K region expressed gene 6, isoform CRA_a OS=Mus musculus GN=H2-Ke6 PE=2 SV=1;sp|P50171|DHB8_MOUSE Estradiol 17-beta-dehydrogenase 8 OS=Mus musculus GN=Hsd17b8 PE=1 SV=2;sp|P50171-2|DHB8_MOUSE Isoform Long of Estradiol 17-b	4	11	11	11	9	9	9	9	9	9	61.4	61.4	61.4	26.587	259	259;259;274;234	6.32	2	1	1	3	4	4	5	17	0	309.9	422810	381560	394980	374200	414580	428320	405880	379740	401170	384810	418890	414670	29	29	29	29	29	29	159850	141420	149170	147480	162360	166190	262960	240140	245810	226710	252220	262130	153390	140950	151450	151320	164040	160930	252490	238790	249720	233490	254850	253730	12	12	12	12	12	12	17	17	17	17	17	17	46.3	51	8493600000	3090200000	5403400000	42				0	1707;5068;5754;17624;17838;18452;24153;26755;37315;45123;45334	True;True;True;True;True;True;True;True;True;True;True	1818;5449;5450;6176;18892;19121;19782;25924;28715;40406;48814;49037	5720;5721;5722;5723;5724;16176;16177;18100;56337;56338;56339;56340;57035;57036;57037;57038;57039;57040;59065;59066;59067;59068;77499;77500;85797;119780;119781;119782;119783;145479;145480;145481;145482;145483;146163;146164;146165	6359;6360;6361;6362;6363;6364;6365;6366;17918;17919;20019;62610;62611;62612;62613;63369;63370;63371;63372;63373;63374;65658;65659;65660;65661;86004;86005;86006;86007;95195;133113;133114;133115;133116;161672;161673;161674;161675;161676;162411;162412;162413	6366;17919;20019;62610;63370;65660;86006;95195;133113;161676;162411	0;1	113;204