maxplanck-ie
diff --git a/‎Snakefile‎
Lines changed: 3 additions & 3 deletions b/‎Snakefile‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎configs/config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎configs/config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎organisms/hg38.yaml‎
Lines changed: 1 addition & 0 deletions b/‎organisms/hg38.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎organisms/mm39.yaml‎
Lines changed: 1 addition & 0 deletions b/‎organisms/mm39.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎rscripts/edgeR_gene.Rmd‎
Lines changed: 151 additions & 0 deletions b/‎rscripts/edgeR_gene.Rmd‎
Lines changed: 151 additions & 0 deletions
diff --git a/‎rscripts/edgeR_gene_allele.Rmd‎
Lines changed: 162 additions & 0 deletions b/‎rscripts/edgeR_gene_allele.Rmd‎
Lines changed: 162 additions & 0 deletions
@@ -48,7 +48,7 @@ if not fromBam:
         expand("oarfish_output/{sample}.meta_info.json",sample=samples),
         expand("oarfish_output/{sample}.infreps.pq",sample=samples)]
     if config["sampleSheet"]:
-        req_files.append("edgeR_output/report.html")
+        req_files.append(["edgeR_transcript_output/report.html","edgeR_gene_output/report.html"])
 else:
     include: "snakefiles/make_allelic_reads.smk"
     include: "snakefiles/oarfish_allelic.smk"
@@ -62,8 +62,8 @@ else:
         expand("oarfish_output/{sample}_{allele}.infreps.pq",sample=samples,allele=alleles)
     ]
     if config["sampleSheet"]:
-        req_files.append(["edgeR_allele_output/report.html","edgeR_allele_condition_output/report.html"])
-
+        req_files.append(["edgeR_transcript_allele_output/report.html","edgeR_transcript_allele_condition_output/report.html"])
+        req_files.append(["edgeR_gene_allele_output/report.html","edgeR_gene_allele_condition_output/report.html"])
 
 
 rule all:
 
@@ -3,4 +3,4 @@ fromBam: False
 phasedVcf:
 sampleSheet: /data/manke/sikora/oarfish_miniworkshop/oarfish_test_data/sampleSheet.tsv
 organism: hg38
-
+correctRTA: False
@@ -93,3 +93,4 @@ ignoreForNormalization: chrX chrY chrM GL000008.2 GL000009.2 GL000194.1 GL000195
 known_splicesites: /data/repository/organisms/GRCh38_gencode_40/gencode/release-40/HISAT2/genome.ss
 star_index: /data/repository/organisms/GRCh38_gencode_40/Indices/STAR_2.7.10
 rmsk_file: /data/repository/organisms/GRCh38_gencode_40/repeatMasker/genome.fa.tbl
+t2g: /data/repository/organisms/GRCh38_gencode_40/gencode/release-40/transcript2gene.txt
@@ -21,3 +21,4 @@ ignoreForNormalization: MT X Y JH584299.1 GL456233.2 JH584301.1 GL456211.1 GL456
 known_splicesites: /data/repository/organisms/GRCm39_ensembl_106/ensembl/release-106/HISAT2/genome.ss
 rmsk_file: /data/repository/organisms/GRCm39_ensembl_106/repeatMasker/genome.fa.tbl
 star_index: /data/repository/organisms/GRCm39_ensembl_106/Indices/STAR_2.7.10
+t2g: /data/repository/organisms/GRCm39_ensembl_106/ensembl/release-106/transcript2gene.txt
@@ -0,0 +1,151 @@
+---
+title: "edgeR"
+author: "Katarzyna Sikora"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+# Resolve the report's output directory from the Snakemake params and make
+# sure it exists before knitr is pointed at it.
+wd <- file.path(snakemake@params[["basedir"]], snakemake@params[["outdir"]])
+# dir.create() does not go through a shell, so paths containing spaces or
+# shell metacharacters work (the former `mkdir -p` command string was unquoted).
+dir.create(wd, recursive = TRUE, showWarnings = FALSE)
+# showWarnings = FALSE also hides real failures, so check the result explicitly.
+stopifnot(dir.exists(wd))
+knitr::opts_chunk$set(echo = TRUE)
+# Later chunks write their outputs using paths relative to `wd`.
+knitr::opts_knit$set(root.dir = wd)
+```
+
+## Libs
+
+```{r libs}
+library(tximport)
+library(edgeR)
+library(GenomicRanges)
+```
+
+## Import counts and summarize gene-wise
+
+Original oarfish output will be used.
+
+The oarfish output files are read using the `tximport` function, which imports the transcript-level counts and summarizes them per gene using a transcript-to-gene mapping file.
+
+```{r tximport}
+# Per-sample oarfish quantification files handed over by Snakemake.
+input_files <- unlist(snakemake@params[["input_files"]])
+# Sample name = file name without its trailing ".quant" extension.
+# The pattern is escaped and anchored: the former unescaped ".quant" was a
+# regex that also removed any "<char>quant" occurring inside a sample name.
+names(input_files) <- sub("\\.quant$", "", basename(input_files))
+# Transcript-to-gene map; only the first two columns are used
+# (assumed to be transcript ID then gene ID - confirm against the t2g file).
+tx2gene_file <- snakemake@input[["t2g"]]
+tx2gene <- read.delim(tx2gene_file, header = FALSE)[, c(1, 2)]
+# Import transcript-level estimates and summarize them per gene.
+txi <- tximport(input_files, type = "oarfish", tx2gene = tx2gene)
+
+```
+
+## Prep the DGEList object
+
+
+```{r sampleinfo}
+# Gene-level count matrix returned by tximport; exported as-is for reference.
+data.counts <- txi$counts
+write.table(data.counts, "data.counts.tsv", sep = "\t", quote = FALSE)
+
+# Sample sheet: tab-separated with a header line. The `name` and `condition`
+# columns are required by the code below.
+sampleSheet <- snakemake@input[["sampleSheet"]]
+sampleInfo <- read.table(sampleSheet, sep = "\t", header = TRUE)
+stopifnot(all(c("name", "condition") %in% colnames(sampleInfo)))
+# Force character names so purely numeric sample names are not interpreted
+# as positional column indices when subsetting the count matrix.
+sampleInfo$name <- as.character(sampleInfo$name)
+rownames(sampleInfo) <- sampleInfo$name
+write.table(sampleInfo, "sampleInfo.tsv", sep = "\t", quote = FALSE)
+
+# Fail with a readable message instead of "subscript out of bounds".
+missing_samples <- setdiff(sampleInfo$name, colnames(data.counts))
+if (length(missing_samples) > 0) {
+  stop("Samples in the sample sheet without oarfish counts: ",
+       paste(missing_samples, collapse = ", "), call. = FALSE)
+}
+# Subset/reorder the count columns to follow the sample sheet order.
+data.counts <- data.counts[, sampleInfo$name, drop = FALSE]
+
+
+y <- DGEList(counts = data.counts, samples = sampleInfo, group = sampleInfo$condition)
+
+# No `genes` annotation is passed to DGEList(), so this is expected to print
+# NULL until the GTF-based annotation below is enabled.
+head(y$genes)
+
+#gtf_file<-snakemake@input[["gtf_file"]]
+#gtf<-rtracklayer::import(gtf_file)
+```
+
+
+## Filter and normalize
+
+Lowly expressed genes are filtered out prior to the downstream analysis.   
+
+Scaling factors can be computed using the TMM method to convert the resulting library sizes to effective library sizes.   
+
+```{r filt norm}
+# Flag genes with enough counts to be retained for the analysis.
+keep <- filterByExpr(y)
+# Report how many genes pass (TRUE) / fail (FALSE) the expression filter.
+table(keep)
+# Keep passing genes only; keep.lib.sizes=FALSE asks edgeR to recompute the
+# library sizes from the remaining rows.
+y <- y[keep, , keep.lib.sizes=FALSE]
+
+
+# Compute normalization factors (TMM, per the text above) and show the
+# per-sample table including lib.size and norm.factors.
+y <- normLibSizes(y)
+y$samples
+```
+
+
+## Calculate MDS
+
+MDS plots can also be used to visualize differences between the expression profiles of different samples with gene-level counts.   
+
+```{r MDS}
+# Colour the sample labels by condition. `condition` is converted to a factor
+# first: indexing a palette with a character vector (which is what read.table
+# yields under R >= 4.0) returns NA colours, i.e. invisible labels. Using the
+# factor's integer codes also gives one colour per level for >2 conditions.
+# NOTE(review): xlim is fixed to [-4, 4]; samples outside that range are clipped.
+plotMDS(y, col = as.integer(factor(y$samples$condition)),
+        labels = y$samples$name, xlim = c(-4, 4))
+```
+
+## Design matrix
+
+We create the design matrix to compare the conditions listed in the sample sheet (`~ condition`).   
+
+```{r design}
+# Model with an intercept plus a `condition` effect, built from the sample
+# table stored in the DGEList.
+# NOTE(review): the reference level is whatever R picks first for `condition`
+# (alphabetical for character columns) - confirm this is the intended baseline.
+design <- model.matrix(~ condition, data = y$samples)
+# Print the matrix so the coefficient columns are visible in the report.
+design
+```
+
+
+## Dispersion estimation
+
+Estimate and visualize NB dispersions.   
+
+```{r disp}
+# Estimate NB dispersions for the given design (robust=TRUE requested).
+y <- estimateDisp(y, design, robust=TRUE)
+# Persist the filtered, normalized DGEList with dispersions for later reuse.
+saveRDS(y,"y.RDS")
+# Show the common dispersion and the BCV plot in the report.
+y$common.dispersion
+plotBCV(y)
+
+```
+
+The NB dispersion estimates will not be used further under the latest quasi-likelihood (QL) pipeline.   
+For DGE analyses, we're going to use the quasi-likelihood (QL) pipeline for stricter error rate control by accounting for the uncertainty associated with the dispersion estimation.
+
+```{r quasil}
+# Fit the quasi-likelihood GLM for every gene using the design above.
+fit <- glmQLFit(y, design, robust=TRUE)
+# Diagnostic plot of the QL dispersion estimates.
+plotQLDisp(fit)
+```
+
+## Differential expression
+
+Differentially expressed genes are tested between conditions using the QL F-test.
+
+```{r dte}
+# QL F-test. No `coef`/`contrast` is given, so edgeR's default applies
+# (per edgeR docs: the last column of the design matrix).
+qlf <- glmQLFTest(fit)
+# Classify genes as up/down/not significant at the 0.05 threshold.
+is.de <- decideTests(qlf, p.value = 0.05)
+summary(is.de)
+
+
+# Full result table restricted by topTags' p.value cutoff of 0.05.
+tt <- as.data.frame(topTags(qlf, n = Inf, p.value = 0.05))
+head(tt)
+# Number of significant genes. Gene IDs are the row names of `tt`: there is no
+# `gene_id` column because y$genes is never populated in this report, so the
+# former length(unique(tt$gene_id)) always reported 0.
+nrow(tt)
+```
+## MA plot
+
+```{r ma}
+# Mean-difference (MA) plot of the tested comparison.
+plotMD(qlf)
+```
+
+## Save and export results
+
+```{r save}
+# Persist the full test object so results can be re-queried without refitting.
+saveRDS(qlf,"qlf.RDS")
+# Export the filtered result table (row names = gene IDs) as TSV.
+# NOTE(review): the file name says "pval", but per edgeR docs the topTags()
+# p.value cutoff applies to adjusted p-values - confirm the naming is intended.
+write.table(tt,"topTags_pval0.05.tsv",sep="\t",quote=FALSE)
+
+```
+
+
+## Session Info
+
+```{r session info}
+# Record R and package versions in the report for reproducibility.
+sessionInfo()
+```
+
+
@@ -0,0 +1,162 @@
+---
+title: "edgeR"
+author: "Katarzyna Sikora"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+# Resolve the report's output directory from the Snakemake params and make
+# sure it exists before knitr is pointed at it.
+wd <- file.path(snakemake@params[["basedir"]], snakemake@params[["outdir"]])
+# dir.create() does not go through a shell, so paths containing spaces or
+# shell metacharacters work (the former `mkdir -p` command string was unquoted).
+dir.create(wd, recursive = TRUE, showWarnings = FALSE)
+# showWarnings = FALSE also hides real failures, so check the result explicitly.
+stopifnot(dir.exists(wd))
+knitr::opts_chunk$set(echo = TRUE)
+# Later chunks write their outputs using paths relative to `wd`.
+knitr::opts_knit$set(root.dir = wd)
+```
+
+## Libs
+
+```{r libs}
+library(tximport)
+library(edgeR)
+library(GenomicRanges)
+```
+
+## Import counts and summarize gene-wise
+
+Original oarfish output will be used.
+
+The oarfish output files are read using the `tximport` function, which imports the transcript-level counts and summarizes them per gene using a transcript-to-gene mapping file.
+
+```{r tximport}
+# Per-sample, per-allele oarfish quantification files handed over by Snakemake.
+input_files <- unlist(snakemake@params[["input_files"]])
+# Column name = file name without its trailing ".quant" extension.
+# The pattern is escaped and anchored: the former unescaped ".quant" was a
+# regex that also removed any "<char>quant" occurring inside a sample name.
+names(input_files) <- sub("\\.quant$", "", basename(input_files))
+# Transcript-to-gene map; only the first two columns are used
+# (assumed to be transcript ID then gene ID - confirm against the t2g file).
+tx2gene_file <- snakemake@input[["t2g"]]
+tx2gene <- read.delim(tx2gene_file, header = FALSE)[, c(1, 2)]
+# Import transcript-level estimates and summarize them per gene.
+txi <- tximport(input_files, type = "oarfish", tx2gene = tx2gene)
+
+```
+
+## Prep the DGEList object
+
+```{r sampleinfo}
+# Gene-level count matrix returned by tximport; exported as-is for reference.
+data.counts <- txi$counts
+write.table(data.counts, "data.counts.tsv", sep = "\t", quote = FALSE)
+
+# Sample sheet: tab-separated with a header line; the `name` column is
+# required to derive the per-allele column names.
+sampleSheet <- snakemake@input[["sampleSheet"]]
+sampleInfo <- read.table(sampleSheet, sep = "\t", header = TRUE)
+stopifnot("name" %in% colnames(sampleInfo))
+
+# Duplicate every sample sheet row once per allele (h1, h2) and stack them.
+sampleInfo_h1 <- sampleInfo
+sampleInfo_h2 <- sampleInfo
+sampleInfo_h1$allele <- "h1"
+sampleInfo_h2$allele <- "h2"
+# rbind() of two data frames already returns a data frame.
+allelic_sampleInfo <- rbind(sampleInfo_h1, sampleInfo_h2)
+#rownames(sampleInfo)<-sampleInfo$name
+# "<name>_<allele>" is expected to match the count matrix column names.
+allelic_sampleInfo$unique_name <- with(allelic_sampleInfo, paste0(name, "_", allele))
+
+write.table(allelic_sampleInfo, "allelic_sampleInfo.tsv", sep = "\t", quote = FALSE)
+
+# Fail with a readable message instead of "subscript out of bounds".
+missing_samples <- setdiff(allelic_sampleInfo$unique_name, colnames(data.counts))
+if (length(missing_samples) > 0) {
+  stop("Sample/allele combinations without oarfish counts: ",
+       paste(missing_samples, collapse = ", "), call. = FALSE)
+}
+# Subset/reorder the count columns to follow the allelic sample table.
+data.counts <- data.counts[, allelic_sampleInfo$unique_name, drop = FALSE]
+
+
+y <- DGEList(counts = data.counts, samples = allelic_sampleInfo, group = allelic_sampleInfo$allele)
+
+# No `genes` annotation is passed to DGEList(), so this is expected to print
+# NULL until the GTF-based annotation below is enabled.
+head(y$genes)
+
+#gtf_file<-snakemake@input[["gtf_file"]]
+
+
+#gtf<-rtracklayer::import(gtf_file)
+
+#ginfo <- mcols(gtf)[match(rownames(y),mcols(gtf)$transcript_id),c("transcript_type","gene_id","gene_name")]
+#y$genes <- cbind(y$genes,ginfo)
+#head(y$genes)
+```
+
+
+## Filter and normalize
+
+Lowly expressed genes are filtered out prior to the downstream analysis.   
+Scaling factors can be computed using the TMM method to convert the resulting library sizes to effective library sizes.
+
+```{r filt norm}
+# Flag genes with enough counts to be retained for the analysis.
+keep <- filterByExpr(y)
+# Report how many genes pass (TRUE) / fail (FALSE) the expression filter.
+table(keep)
+# Keep passing genes only; keep.lib.sizes=FALSE asks edgeR to recompute the
+# library sizes from the remaining rows.
+y <- y[keep, , keep.lib.sizes=FALSE]
+
+
+# Compute normalization factors (TMM, per the text above) and show the
+# per-sample table including lib.size and norm.factors.
+y <- normLibSizes(y)
+y$samples
+```
+
+
+## Calculate MDS
+
+MDS plots can also be used to visualize differences between the expression profiles of different samples with gene-level counts.
+
+```{r MDS}
+# Colour the labels by allele. `allele` is a character column ("h1"/"h2"), and
+# indexing a palette with a character vector returns NA colours (invisible
+# labels), so it is converted to a factor and its integer codes are used.
+# NOTE(review): xlim is fixed to [-4, 4]; samples outside that range are clipped.
+plotMDS(y, col = as.integer(factor(y$samples$allele)),
+        labels = y$samples$unique_name, xlim = c(-4, 4))
+```
+
+## Design matrix
+
+We create the design matrix to compare the two alleles (h1 vs h2) across all samples (`~ allele`).   
+
+```{r design}
+# Model with an intercept plus an `allele` effect, built from the allelic
+# sample table stored in the DGEList.
+# NOTE(review): h1 and h2 columns of the same sample are paired measurements;
+# this design ignores the pairing - consider whether `~ name + allele` is intended.
+design <- model.matrix(~ allele, data = y$samples)
+# Print the matrix so the coefficient columns are visible in the report.
+design
+```
+
+
+## Dispersion estimation
+
+Estimate and visualize NB dispersions.   
+
+```{r disp}
+# Estimate NB dispersions for the given design (robust=TRUE requested).
+y <- estimateDisp(y, design, robust=TRUE)
+# Persist the filtered, normalized DGEList with dispersions for later reuse.
+saveRDS(y,"y.RDS")
+# Show the common dispersion and the BCV plot in the report.
+y$common.dispersion
+plotBCV(y)
+
+```
+
+The NB dispersion estimates will not be used further under the latest quasi-likelihood (QL) pipeline.   
+For DGE analyses, we're going to use the quasi-likelihood (QL) pipeline for stricter error rate control by accounting for the uncertainty associated with the dispersion estimation.
+
+```{r quasil}
+# Fit the quasi-likelihood GLM for every gene using the design above.
+fit <- glmQLFit(y, design, robust=TRUE)
+# Diagnostic plot of the QL dispersion estimates.
+plotQLDisp(fit)
+```
+
+## Differential expression
+
+Differentially expressed genes are tested between alleles using the QL F-test.
+
+```{r dte}
+# QL F-test. No `coef`/`contrast` is given, so edgeR's default applies
+# (per edgeR docs: the last column of the design matrix).
+qlf <- glmQLFTest(fit)
+# Classify genes as up/down/not significant at the 0.05 threshold.
+is.de <- decideTests(qlf, p.value = 0.05)
+summary(is.de)
+
+
+# Full result table restricted by topTags' p.value cutoff of 0.05.
+tt <- as.data.frame(topTags(qlf, n = Inf, p.value = 0.05))
+head(tt)
+# Number of significant genes. Gene IDs are the row names of `tt`. The former
+# table(tt$transcript_type) and length(unique(tt$gene_id)) referred to
+# annotation columns that do not exist here (the GTF annotation step is
+# commented out), so they reported an empty table and 0.
+nrow(tt)
+```
+## MA plot
+
+```{r ma}
+# Mean-difference (MA) plot of the tested comparison.
+plotMD(qlf)
+```
+
+## Save and export results
+
+```{r save}
+# Persist the full test object so results can be re-queried without refitting.
+saveRDS(qlf,"qlf.RDS")
+# Export the filtered result table (row names = gene IDs) as TSV.
+# NOTE(review): the file name says "pval", but per edgeR docs the topTags()
+# p.value cutoff applies to adjusted p-values - confirm the naming is intended.
+write.table(tt,"topTags_pval0.05.tsv",sep="\t",quote=FALSE)
+
+```
+
+
+## Session Info
+
+```{r session info}
+# Record R and package versions in the report for reproducibility.
+sessionInfo()
+```
+
+