library(ggplot2) library(scales) library(gridExtra) library(grid) ################## ### BAR COLORS ### ################## gg_color_hue <- function(n) { hues = seq(15, 375, length = n + 1) hcl(h = hues, l = 65, c = 100)[1:n] } cols = gg_color_hue(6) uncorrected = cols[1] hapog_20x = cols[2] hapog_50x = cols[3] hapog_100x = cols[4] hapog_150x = cols[5] hapog_180x = cols[6] ####################### ### DATA FORMATTING ### ####################### dat = read.csv('Arabidopsis_coverage_variation.csv') dat$Coverage = factor(dat$Coverage, levels=c("Uncorrected", "20x", "50x", "100x", "150x", "180x")) ##################### ### IDENTITY RATE ### ##################### id_rate_top = ggplot(data=dat, aes(x=Coverage, y=percent_identity)) + geom_col(position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none")+ xlab("") + ylab("") + scale_y_continuous(breaks=c(0, 99.90, 99.902, 99.904, 99.906, 99.908, 99.91), labels=c("0", "99.900", "99.902", "99.904", "99.906", "99.908", "99.910")) + coord_cartesian(ylim=c(99.9, 99.91)) id_rate_bottom = ggplot(data=dat, aes(x=Coverage, y=percent_identity)) + geom_col(position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none")+ xlab("Illumina coverage used to polish with Hapo-G") + ylab("") + scale_y_continuous(breaks=c(0, 25, 50, 75, 100), labels=c("0", "25.00", "50.00", "75.00", "100")) id_rate = grid.arrange(id_rate_top, id_rate_bottom, heights=c(2, 3), ncol=1, nrow=2, left=textGrob("Identity rate(%)", rot=90, vjust=1)) ##################### ### QUALITY SCORE ### ##################### qscore_top = ggplot(data=dat, aes(x=Coverage, y=Qscore)) + geom_histogram(stat="identity", position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("") + ylab("") + coord_cartesian(ylim=c(30, 32)) qscore_bottom = ggplot(data=dat, aes(x=Tool, y=Qscore)) + geom_histogram(stat="identity", position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("Illumina coverage used to polish with Hapo-G") + ylab("") + scale_y_continuous(breaks=c(0, 10, 20, 30), labels=c("0", "10.0", "20.0", "30.0")) qscore = grid.arrange(qscore_top, qscore_bottom, heights=c(2, 3), ncol=1, nrow=2, left=textGrob("Quality score", rot=90, vjust=1)) ggsave("Arabidopsis_qscore.png", plot=qscore, width=16, height=9) ########################### ### 100% identity pairs ### ########################### pairs_top = ggplot(data=dat, aes(x=Coverage, y=X100p_pairs)) + geom_histogram(stat="identity", position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("") + ylab("") + scale_y_continuous(breaks=c(42500000, 43000000, 43500000), labels=c("42.5", "43.0", "43.5")) + coord_cartesian(ylim=c(42500000, 43500000)) pairs_bottom = ggplot(data=dat, aes(x=Coverage, y=X100p_pairs)) + geom_histogram(stat="identity", position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("Illumina coverage used to polish with Hapo-G") + ylab("") + scale_y_continuous(breaks=c(0, 10000000, 20000000, 30000000, 40000000, 50000000), labels=c("0", "10.0", "20.0", "30.0", "40.0", "50.0")) pairs = grid.arrange(pairs_top, pairs_bottom, heights=c(2, 3), ncol=1, nrow=2, left=textGrob("Million of perfectly mapped pairs", rot=90, vjust=1)) ############# ### EXONS ### ############# exons_top = ggplot(data=dat, aes(x=Coverage, y=X100p_exons)) + geom_histogram(stat="identity", position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size=12),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("") + ylab("") + scale_y_continuous(label=comma, breaks=c(198500, 199000, 199500, 200000)) + coord_cartesian(ylim=c(198500, 200000)) exons_bottom = ggplot(data=dat, aes(x=Coverage, y=X100p_exons)) + geom_histogram(stat="identity", position=position_dodge2(width = 0.9, preserve = "single"), color="black", fill=c(uncorrected, hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size=12),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("Illumina coverage used to polish with Hapo-G") + ylab("") + scale_y_continuous(label=comma, breaks=c(0, 50000, 100000, 150000, 200000)) + coord_cartesian(ylim=c(0, 200000)) exons = grid.arrange(exons_top, exons_bottom, heights=c(2, 3), ncol=1, nrow=2, left=textGrob("Number of exons aligned with 100% identity", rot=90, vjust=1)) ################ ### RUN TIME ### ################ dat2 = subset(dat, dat$Tool != "Uncorrected") run_time = ggplot(data=dat2, aes(x=Coverage, y=run_time)) + geom_histogram(stat="identity", position=position_dodge2(preserve = "single"), color="black", fill=c(hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("Illumina coverage used to polish with Hapo-G") + ylab("Total running time (minutes)") + #scale_y_continuous(limits=c(0, 210)) coord_cartesian(ylim=c(0, 50)) id_rate <- arrangeGrob(id_rate, top = textGrob("A", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18, fontfamily="Times Roman"))) qscore <- arrangeGrob(qscore, top = textGrob("B", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18, fontfamily="Times Roman"))) pairs <- arrangeGrob(pairs, top = textGrob("B", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18, fontfamily="Times Roman"))) exons <- arrangeGrob(exons, top = textGrob("C", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18, fontfamily="Times Roman"))) run_time <- arrangeGrob(run_time, top = textGrob("D", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18, fontfamily="Times Roman"))) g <- grid.arrange(id_rate, pairs, exons, run_time, ncol=2, nrow=2) ggsave(file="Arabidopsis_coverage_variation_metrics.png", plot=g, width=16, height=9) ################### ### MEMORY PEAK ### ################### dat2 = subset(dat, dat$Tool != "Uncorrected") memory_peak = ggplot(data=dat2, aes(x=Coverage, y=peak_memory.GB.)) + geom_histogram(stat="identity", position=position_dodge2(preserve = "single"), color="black", fill=c(hapog_20x, hapog_50x, hapog_100x, hapog_150x, hapog_180x)) + theme_bw() + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 14),legend.text = element_text(size = 22),plot.title = element_text(size = 22)) + theme(legend.position = "none") + xlab("Illumina coverage used to polish with Hapo-G") + ylab("Memory peak (GB)")+ scale_y_continuous(limits=c(0, 60)) ggsave(file="Arabidopsis_coverage_variation_memory_peak.png", plot=memory_peak, width=16, height=9)