# parameter_STAR.in

#### All default parameter values are found in the /groups/churchman/jd187/Program/STAR/STAR-STAR_2.4.0d/parametersDefault file ##############

### versions
#versionSTAR             020201
#    int>0: STAR release numeric ID. Please do not change this value!
#versionGenome           020101 020200
#    int>0: oldest value of the Genome version compatible with this STAR release. Please do not change this value!

### PARAMETERS
#parametersFiles          -
#    string: name of a user-defined parameters file, "-": none. Can only be defined on the command line.
# provided at the command line level

### RUN PARAMETERS
#runMode                         alignReads
#    string: type of the run:    alignReads      ... map reads
#                                genomeGenerate  ... generate genome files
#                                inputAlignmentsFromBAM    ... input alignments from BAM. Presently only works with --outWigType to generate wiggle files.

#runThreadN                      1
#    int: number of threads to run STAR
# provided at the command line level in case we need to tweak the memory settings etc.

### GENOME PARAMETERS
#genomeDir                   ./GenomeDir/
#    string: path to the directory where genome files are stored (if runMode!=genomeGenerate) or will be generated (if runMode==genomeGenerate)
# provided at the command line level, in case we want to provide a different genome (e.g. built without junctions, "Jct") without changing the parameter file

#genomeLoad              NoSharedMemory
#    mode of shared memory usage for the genome files
#    string:        LoadAndKeep     ... load genome into shared and keep it in memory after run
#                   LoadAndRemove   ... load genome into shared but remove it after run
#                   LoadAndExit     ... load genome into shared memory and exit, keeping the genome in memory for future runs
#                   Remove          ... do not map anything, just remove loaded genome from memory
#                   NoSharedMemory  ... do not use shared memory, each job will have its own private copy of the genome

### GENOME GENERATION PARAMETERS
#genomeFastaFiles            -
#    fasta files with genomic sequence for genome files generation, only used if runMode==genomeGenerate
#    string(s): path(s) to the files from the working directory (separated by spaces)

#genomeChrBinNbits           18
#    int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins

#genomeSAindexNbases         14
#    int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches.

#genomeSAsparseD             1
#    int>0: suffix array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction


### INPUT FROM BAM
#inputBAMfile                -
#    string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM


### READ PARAMETERS
#readFilesIn                 Read1 Read2
#    string(s): paths to files that contain input read1 (and, if needed,  read2)
# given at the command line level, as the read file names depend on the sample name

readFilesCommand        cat
#    string(s): cmd line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout
#               For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.

#readMapNumber               -1
#    int: number of reads to map from the beginning of the file
#                            -1: map all reads

#readMatesLengthsIn          NotEqual
#    string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same  / not the same. NotEqual is safe in all situations.

#clip3pNbases                 0
#    int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.
#
#clip5pNbases                 0
#    int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.

clip3pAdapterSeq           ATCTCGTATGCCGTCTTCTGCTTG
#    string(s): adapter sequences to clip from 3p of each mate.  If one value is given, it will be assumed the same for both mates.

clip3pAdapterMMp            0.21
#    double(s): max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates.
#

clip3pAfterAdapterNbases    1
#    int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates.


### LIMITS
#limitGenomeGenerateRAM               31000000000
#    int>0: maximum available RAM (bytes) for genome generation

limitIObufferSize                    200000000
#    int>0: max available buffers size (bytes) for input/output, per thread

#limitOutSAMoneReadBytes              100000
#    int>0: max size of the SAM record for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax)

#limitOutSJoneRead                    1000
#    int>0: max number of junctions for one read (including all multi-mappers)

#limitOutSJcollapsed                  1000000
#    int>0: max number of collapsed junctions

limitBAMsortRAM                         64000000000
#    int>=0: maximum available RAM for sorting BAM. If =0, it will be set to the genome index size


### OUTPUT: GENERAL
#outFileNamePrefix               ./
#    string: output files name prefix (including full or relative path). Can only be defined on the command line.
#   we define it on the command line as it is depending on the sample name

#outTmpDir                       -
#    string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed!
#            - the temp directory will default to outFileNamePrefix_tmp

#outStd                          Log
#    string: which output will be directed to stdout (standard out)
#      Log : log messages
#      SAM : alignments in .sam format (which normally are output to Aligned.out.sam file), normal std output will go into Log.std.out
#      BAM_Unsorted : alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted
#      BAM_SortedByCoordinate : alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate
#      BAM_Quant : alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM

outReadsUnmapped                Fastx
#   string: output of unmapped reads (besides SAM)
#                                None    : no output
#                                Fastx   : output in separate fasta/fastq files, Unmapped.out.mate1/2

#outQSconversionAdd              0
#   int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31)


### OUTPUT: SAM/BAM
outSAMtype                      BAM SortedByCoordinate
#    strings: type of SAM/BAM output
#                                1st word:
#                                BAM : output BAM without sorting
#                                SAM : output SAM without sorting
#                                None : no SAM/BAM output
#                                2nd, 3rd ...
#                                Unsorted : standard unsorted
#                                SortedByCoordinate : sorted by coordinate

#outSAMmode                      Full
#    string: mode of SAM output  None : no SAM output
#                                Full : full SAM output
#                                NoQS : full SAM but without quality scores

#outSAMstrandField                               None
#    string: Cufflinks-like strand field flag    None        : not used
#                                                intronMotif : strand derived from the intron motif. Reads with inconsistent and/or non-canonical introns are filtered out.

#If you have stranded RNA-seq data, you do not need to use any specific STAR options. Instead, you need
#to run Cufflinks with the --library-type option. For example, cufflinks <…> --library-type
#fr-firststrand should be used for the “standard” dUTP protocol. This option has
#to be used only for Cufflinks runs and not for STAR runs.

outSAMattributes                All
#    string: a string of desired SAM attributes, in the order desired for the output SAM
#                                NH HI AS nM NM MD jM jI XS
#                                Standard    : NH HI AS nM
#                                All         : NH HI AS nM NM MD jM jI
#                                None


#outSAMunmapped          None
#    string: output of unmapped reads in the SAM format
#                                None    : no output
#                                Within  : output unmapped reads within the main SAM file (i.e. Aligned.out.sam)

#outSAMorder                     Paired
#    string: type of sorting for the SAM output
#                                Paired: one mate after the other for all paired alignments
#                                PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files

#outSAMprimaryFlag               OneBestScore
#    string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG
#                                OneBestScore  : only one alignment with the best score is primary
#                                AllBestScore  : all alignments with the best score are primary

#outSAMreadID                    Standard
#    string: read ID record type
#                                Standard  : first word (until space) from the FASTx read ID line, removing /1,/2 from the end
#                                Number    : read number (index) in the FASTx file

#outSAMmapqUnique        255
#    int: 0 to 255: the MAPQ value for unique mappers

#outSAMattrRGline        -
#    string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:",
#            e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
#            xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.
#            Comma separated RG lines correspond to different (comma separated) input files in --readFilesIn.

#outSAMheaderHD          -
#    strings: @HD (header) line of the SAM header

#outSAMheaderPG          -
#    strings: extra @PG (software) line of the SAM header (in addition to STAR)
#
#outSAMheaderCommentFile -
#    string: path to the file with @CO (comment) lines of the SAM header
#
#outBAMcompression   -1
#    int: -1 ... 10  BAM compression level, -1=default compression, 0=no compression, 10=maximum compression
#
#bamRemoveDuplicatesType  -
#    string: remove duplicates from BAM file, for now only works with sorted BAM fed with inputBAMfile
#           UniqueIdentical : removes all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical
#
#bamRemoveDuplicatesMate2basesN   0
#    int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE)
#

### OUTPUT WIGGLE
#outWigType          None
#    string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p"
#                    1st word:
#                    None
#                    bedGraph
#                    wiggle
#                    2nd word:
#                    read1_5p : signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc
#                    read2 : signal from only 2nd read
#
#outWigStrand        Stranded
#    string: strandedness of wiggle (bedGraph) output
#                    Stranded: separate strands, str1 and str2
#                    Unstranded: collapsed strands
#
#outWigReferencesPrefix    -
#    string: prefix matching reference names to include in the output wiggle file, e.g. "chr.*"
#    default: "-" - include all references
#
#outWigNorm              RPM
#    string: type of normalization for the signal
#                        RPM : reads per million of mapped reads
#                        None : no normalization, "raw" counts
#


### OUTPUT FILTERING
#outFilterType           Normal
#    string: type of filtering
#                                Normal: standard filtering using only current alignment
#                                BySJout: keep only those reads that contain junctions that passed filtering into SJ.out.tab

#outFilterMultimapScoreRange     1
#    int: the score range below the maximum score for multimapping alignments.
# as -10*log10(1-1/2) = MapQual for read mapping twice = 3.013, 3.013 -range 3
# keep all multi aligned reads (at least up to 100 times as -10*log10(1-1/100) =
# 0.04364805)... I think... Not sure I got it right though...

outFilterMultimapNmax   2
#    int: read alignments will be output only if the read maps to no more loci than this value, otherwise no alignments will be output

#outFilterMismatchNmax   10
#    int: alignment will be output only if it has fewer mismatches

#outFilterMismatchNoverLmax  0.3
#    float: alignment will be output only if its ratio of mismatches to *mapped* length is less than this value; to make it consistent with parameter used with TH

#outFilterMismatchNoverReadLmax  1
#    float: alignment will be output only if its ratio of mismatches to *read* length is less than this value

#outFilterScoreMin       0
#    int: alignment will be output only if its score is higher than this value

#outFilterScoreMinOverLread  0.66
#        float: outFilterScoreMin normalized to read length (sum of mates' lengths for paired-end reads)

#outFilterMatchNmin      0
#    int: alignment will be output only if the number of matched bases is higher than this value; to make it consistent with parameter used with TH

#outFilterMatchNminOverLread 0.66
#    float: outFilterMatchNmin normalized to read length (sum of mates' lengths for paired-end reads)
# I guess it has something to do with soft clipping? I am not sure, so I left the default value.

#outFilterIntronMotifs   None
#    string: filter alignment using their motifs
#           None : no filtering
#           RemoveNoncanonical : filter out alignments that contain non-canonical junctions
#           RemoveNoncanonicalUnannotated : filter out alignments that contain non-canonical unannotated junctions when using annotated
#                                           splice junctions database. The annotated non-canonical junctions will be kept.

### OUTPUT FILTERING: SPLICE JUNCTIONS
#outSJfilterReads        All
#    string: which reads to consider for collapsed splice junctions output
#                All: all reads, unique- and multi-mappers
#                Unique: uniquely mapping reads only

outSJfilterOverhangMin  3  1  1  1
#    4*int:    minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG motif,
#                       (3) GC/AG motif, (4) AT/AC motif. -1 means no output for that motif.
#                       Does not apply to annotated junctions.

#outSJfilterCountUniqueMin   3   1   1   1
#    4*int: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif,
#           (4) AT/AC motif. -1 means no output for that motif. Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied.
#           Does not apply to annotated junctions.

#outSJfilterCountTotalMin    3   1   1   1
#    4*int: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG motif, (3) GC/AG motif,
#           (4) AT/AC motif. -1 means no output for that motif. Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied.
#           Does not apply to annotated junctions.

outSJfilterDistToOtherSJmin 0  0   0   0
#    4*int>=0: minimum allowed distance to other junctions' donor/acceptor
#                                does not apply to annotated junctions

#outSJfilterIntronMaxVsReadN        50000 100000 200000
#    N*int>=0: maximum gap allowed for junctions supported by 1,2,3...N reads
#              i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000b, by >=4
#              reads: any gap <=alignIntronMax. Does not apply to annotated junctions.

### SCORING
#scoreGap                     0
#    gap open penalty

#scoreGapNoncan               -8
#    non-canonical gap open penalty (in addition to scoreGap)

#scoreGapGCAG                 -4
#    GCAG gap open penalty (in addition to scoreGap)

#scoreGapATAC                 -8
#    ATAC gap open penalty  (in addition to scoreGap)

#scoreGenomicLengthLog2scale   -0.25
#    extra score logarithmically scaled with genomic length of the alignment: -scoreGenomicLengthLog2scale*log2(genomicLength)

#scoreDelOpen                 -2
#    deletion open penalty

#scoreDelBase                 -2
#    deletion extension penalty per base (in addition to scoreDelOpen)

#scoreInsOpen                 -2
#    insertion open penalty

#scoreInsBase                 -2
#    insertion extension penalty per base (in addition to scoreInsOpen)

#scoreStitchSJshift           1
#    maximum score reduction while searching for SJ boundaries in the stitching step


### ALIGNMENT and SEED PARAMETERS
#seedSearchStartLmax          50
#    int>0: defines the search start point through the read - the read is split into pieces no longer than this value

#seedSearchStartLmaxOverLread    1.0
#    float: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)

#seedSearchLmax          0
#    int>=0: defines the maximum length of the seeds, if =0 max seed length is infinite

#seedMultimapNmax        10000
#    int>0: only pieces that map fewer than this value are utilized in the stitching procedure

#seedPerReadNmax         1000
#    int>0: max number of seeds per read

#seedPerWindowNmax       50
#    int>0: max number of seeds per window

#seedNoneLociPerWindow   10
#    int>0: max number of one seed loci per window

alignIntronMin          11
#    minimum intron size: genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion
# modified specifically for tRNA introns

#alignIntronMax          0
#    maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins

#alignMatesGapMax            0
#    maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins

#alignSJoverhangMin          5
#    int>0: minimum overhang (i.e. block size) for spliced alignments

#alignSJDBoverhangMin        3
#    int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments

#alignSplicedMateMapLmin     0
#    int>0: minimum mapped length for a read mate that is spliced

#alignSplicedMateMapLminOverLmate 0.66
#    float>0: alignSplicedMateMapLmin normalized to mate length

#alignWindowsPerReadNmax     10000
#    int>0: max number of windows per read

#alignTranscriptsPerWindowNmax   100
#    int>0: max number of transcripts per window

#alignTranscriptsPerReadNmax     10000
#    max number of different alignments per read to consider

alignEndsType           EndToEnd
#    string: type of read ends alignment
#                        Local   : standard local alignment with soft-clipping allowed
#                        EndToEnd: force end-to-end read alignment, do not soft-clip

### SPLICE JUNCTIONS DATABASE PARAMETERS
#sjdbFileChrStartEnd                     -
#    string: path to the file with genomic coordinates (chr <tab> start <tab> end <tab> strand) for the introns
# we will provide it in the command line directly, as the path depends on the sample
# NO: the annotations can be supplied in the form of splice junctions' loci or a GTF (or GFF3) file AND, importantly, the annotations have to be supplied at the genome/suffix array generation step

#sjdbGTFfile                             -
#    string: path to the GTF file with annotations
# we will provide it in the command line directly

#sjdbGTFchrPrefix                        -
#    string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSEMBL annotations with UCSC genomes)

#sjdbGTFfeatureExon                      exon
#    string: feature type in GTF file to be used as exons for building transcripts

#sjdbGTFtagExonParentTranscript          transcript_id
#    string: tag name to be used as exons' parents for building transcripts

#sjdbOverhang                            0
#    int>=0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)
#            if =0, splice junction database is not used

#sjdbScore                               2
#    int: extra alignment score for alignments that cross database junctions

### WINDOWS, ANCHORS, BINNING
#winAnchorMultimapNmax           50
#    int>0: max number of loci anchors are allowed to map to

#winBinNbits                     16
#    int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering,each window will occupy an integer nb of bins.

#winAnchorDistNbins              9
#    int>0: max number of bins between two anchors that allows aggregation of anchors into one window

#winFlankNbins               4
#    int>0: log2(winFlank), where winFlank is the size of the left and right flanking regions for each window


### CHIMERIC ALIGNMENTS
#chimSegmentMin              0
##    int>0: minimum length of chimeric segment length, if ==0, no chimeric output

#chimScoreMin                0
##    int>0: minimum total (summed) score of the chimeric segments

#chimScoreDropMax            20
##    int>0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length

#chimScoreSeparation         10
##    int>0: minimum difference (separation) between the best chimeric score and the next one

#chimScoreJunctionNonGTAG    -1
##    int: penalty for a non-GT/AG chimeric junction

#chimJunctionOverhangMin     20
##    int>0: minimum overhang for a chimeric junction


### QUANTIFICATION OF ANNOTATIONS
#quantMode                   -
#    string(s): types of quantification requested
#                            - : None
#                            TranscriptomeSAM : output SAM/BAM alignments to transcriptome into a separate file


#### 2-PASS
#twopass1readsN              0
#    int>0: number of reads to process for the 1st step. 0 : 1-step only, no 2nd pass; use very large number to map all reads in the first step
#
#twopassSJlimit              1000000
#    int>=0: maximum number of junction detected in the 1st step
#