+ r_plotting_script = 'opt <- c()\nopt$input_folder = workdir\nopt$output_file =paste0(workdir, "/QC_plots.pdf")\n\n#load fastQC summaries and create per test table\ninp <- list.files(opt$input_folder, pattern = ".zip")\n\n\nfastqc_results <- lapply(inp, function(k){\n unzip(paste(opt$input_folder, k, sep = "/"),exdir = opt$input_folder)\n inpu <- read_delim(paste(paste(gsub(".zip", "", paste(opt$input_folder,k, sep = "/"))), \n "summary.txt", sep = "/"), delim = "\t")\n out <- as_data_frame(t(inpu[, 1])) %>%\n mutate(sample.id = names(inpu)[3])\n names(out) <- c(gsub(" ", "_", unlist(inpu[,2])), "sample_id")\n unlink(x = paste(opt$input_folder, gsub(".zip", "", k), sep = "/"), \n recursive = T, force = T)\n \n return(out)\n})\n\nn_cols = length(fastqc_results[[1]])\nif (n_cols==11){\n outp <- do.call("rbind.data.frame", fastqc_results)%>%\n select(ID = sample_id,\n PBQ = Per_base_sequence_quality,\n PTQ = Per_tile_sequence_quality,\n PSQ = Per_sequence_quality_scores,\n PBC = Per_base_sequence_content,\n SGC = Per_sequence_GC_content,\n PBN = Per_base_N_content,\n SLD = Sequence_Length_Distribution,\n SDL = Sequence_Duplication_Levels,\n ORS = Overrepresented_sequences,\n AdC = Adapter_Content)\n #change table format\n ret <- outp %>% \n group_by(ID) %>%\n gather(test, status, PBQ:AdC)\n\n}\nif (n_cols==12){\n outp <- do.call("rbind.data.frame", fastqc_results)%>%\n select(ID = sample_id,\n PBQ = Per_base_sequence_quality,\n PTQ = Per_tile_sequence_quality,\n PSQ = Per_sequence_quality_scores,\n PBC = Per_base_sequence_content,\n SGC = Per_sequence_GC_content,\n PBN = Per_base_N_content,\n SLD = Sequence_Length_Distribution,\n SDL = Sequence_Duplication_Levels,\n ORS = Overrepresented_sequences,\n AdC = Adapter_Content,\n KmC = Kmer_Content)\n #change table format\n ret <- outp %>% \n group_by(ID) %>%\n gather(test, status, PBQ:KmC)\n}\n\n#plot how many samples failed the test\nqc.fail <- ggplot()+\n geom_bar(data = ret, aes(x = test, fill = status), stat = "count", position = "dodge")+\n theme_bw()\n\n#plot which sample failed which test\nqc.samples <- ggplot()+\n geom_tile(data = ret, aes(y = ID, x = test, fill = as.factor(status)))+\n scale_fill_discrete(name = "status")+\n xlab("FastQC test")+\n ylab("Samples")+\n theme_bw()+\n theme(\n axis.text.y = element_blank()\n )\n\n#plot pdf\npdf(opt$output_file)\nprint(qc.fail)\nprint(qc.samples)\ndev.off()\n\npng(gsub(".pdf", "1.png", opt$output_file))\nprint(qc.fail)\ndev.off()\n\npng(gsub(".pdf", "2.png", opt$output_file))\nprint(qc.samples)\ndev.off()\n\n#table with samples that faild a test\nfail <- ret %>%\n filter(status == "FAIL")\n\n#get the ID number of the failed samples\nfail.samp <- fail %>%\n filter(!duplicated(ID)) %>%\n select(ID)%>%\n unlist() %>%\n parse_number()%>%\n unique() %>%\n sort()'
0 commit comments