# load library suppressMessages(library(pscl)) suppressMessages(library(MASS)) args <- commandArgs(trailingOnly=TRUE) # check arguments if (length(args) != 4 && length(args) != 5) { stop( "Need four or five arguments:\n", " 1. (required) the local feature file for chromosome i with five columns: bin_start bin_end density GC mappability.\n", " 2. (required) the local feature file for chromosome j with five columns: bin_start bin_end density GC mappability.\n", " 3. (required) the Hi-C contact matrix between chromosomes i and j.\n", " 4. (required) the output file for normalized Hi-C matrix.\n", " 5. (optional) the method for modeling Hi-C data: Poisson, NB, ZIP, ZINB, PH, NBH (default).\n", call. = FALSE ) } # read data dat_feature1 <-read.table(args[1]) names(dat_feature1) <- c("binStart", "binEnd", "density", "GC", "map") dat_feature2 <-read.table(args[2]) names(dat_feature2) <- c("binStart", "binEnd", "density", "GC", "map") dat_HiC <-read.table(args[3]) if (nrow(dat_feature1) != nrow(dat_HiC) || nrow(dat_feature2) != ncol(dat_HiC)) { stop("The number of rows for the feature files and Hi-C file is not consistent.\n", call. = FALSE) } # determine the method output_file <- args[4] methods <- c("Poisson", "NB", "ZIP", "ZINB", "PH", "NBH") method <- "NBH" if (length(args) == 5 && args[5] %in% methods) { method <- args[5] } # tidy data frame: remove rows rows_del_feature1 <- which(dat_feature1$density == 0 | dat_feature1$GC == 0 | dat_feature1$map == 0) rows_del_feature2 <- which(dat_feature2$density == 0 | dat_feature2$GC == 0 | dat_feature2$map == 0) rows_del_HiC1 <- which(apply(dat_HiC, 1, sum) == 0) rows_del_HiC2 <- unname(which(apply(dat_HiC, 2, sum) == 0)) rows_del1 <- c(rows_del_feature1, rows_del_HiC1) rows_del1 <- sort(rows_del1[!duplicated(rows_del1)]) dat_feature1 <- dat_feature1[-rows_del1,] rows_del2 <- c(rows_del_feature2, rows_del_HiC2) rows_del2 <- sort(rows_del2[!duplicated(rows_del2)]) dat_feature2 <- dat_feature2[-rows_del2,] dat_HiC <- dat_HiC[-rows_del1,-rows_del2] if (length(rows_del1) > 0) { cat("The following rows in the first feature file are removed: ", rows_del1, ".\n") } if (length(rows_del2) > 0) { cat("The following rows in the second feature file are removed: ", rows_del2, ".\n") } mat_HiC <- as.matrix(dat_HiC) vec_HiC <- c(mat_HiC) # get the feature matrix and Z-score mat_density <- as.matrix(log(dat_feature1[, 3] %o% dat_feature2[, 3])) mat_GC <- as.matrix(log(dat_feature1[, 4] %o% dat_feature2[, 4])) mat_map <- as.matrix(log(dat_feature1[, 5] %o% dat_feature2[, 5])) mat_density <- (mat_density - mean(c(mat_density))) / sd(c(mat_density)) mat_GC <- (mat_GC - mean(c(mat_GC))) / sd(c(mat_GC)) vec_density <- c(mat_density) vec_GC <- c(mat_GC) vec_map <- c(mat_map) fit_model <- NULL if (method == "Poisson") { cat("Using Poisson. \n") fit_model <- glm(vec_HiC ~ vec_density + vec_GC + offset(vec_map), family="poisson") } else if (method == "NB") { cat("Using Negative Binomial. \n") fit_model <- glm.nb(vec_HiC ~ vec_density + vec_GC + offset(vec_map)) } else if (method == "ZIP") { cat("Using Zero-inflated Poisson. \n") fit_model <- zeroinfl(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map, dist="poisson") } else if (method == "ZINB") { cat("Using Zero-inflated Negative Binomial. \n") fit_model <- zeroinfl(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map, dist="negbin") } else if (method == "PH") { cat("Using Poisson Hurdle. \n") fit_model <- hurdle(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map) } else if (method == "NBH") { cat("Using Negative Ninomial Hurdle. \n") fit_model <- hurdle(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map, dist="negbin") } else { stop("Please select a valid method.\n", call. = FALSE) } # output normalized matrix mat_expected <- matrix(fit_model$fitted.values, nrow = nrow(mat_HiC)) mat_norm <- round(mat_HiC / mat_expected, 4) write.table(mat_norm, file=output_file, row.names=F, col.names=F, sep=" ", quote=F) # End