# load library suppressMessages(library(pscl)) suppressMessages(library(MASS)) args <- commandArgs(trailingOnly=TRUE) # check arguments if (length(args) != 3 && length(args) != 4) { stop( "Need three or four arguments:\n", " 1. (required) the local feature file with five columns: bin_start bin_end density GC mappability.\n", " 2. (required) the Hi-C contact matrix: n-by-n and symmetric.\n", " 3. (required) the output file for normalized Hi-C matrix.\n", " 4. (optional) the method for modeling Hi-C data: Poisson, NB, ZIP, ZINB, PH, NBH (default).\n", call. = FALSE ) } # read data dat_feature <-read.table(args[1]) names(dat_feature) <- c("binStart", "binEnd", "density", "GC", "map") dat_HiC <-read.table(args[2]) if (nrow(dat_feature) != nrow(dat_HiC)) { stop("The number of rows from the first two input files is not the same.\n", call. = FALSE) } # determine the method output_file <- args[3] methods <- c("Poisson", "NB", "ZIP", "ZINB", "PH", "NBH") method <- "NBH" if (length(args) == 4 && args[4] %in% methods) { method <- args[4] } # tidy data frame: remove rows rows_del_feature <- which(dat_feature$density == 0 | dat_feature$GC == 0 | dat_feature$map == 0) rows_del_HiC <- which(apply(dat_HiC, 1, sum) == 0) rows_del <- c(rows_del_feature, rows_del_HiC) rows_del <- rows_del[!duplicated(rows_del)] dat_feature <- dat_feature[-rows_del,] dat_HiC <- dat_HiC[-rows_del,-rows_del] if (length(rows_del) > 0) { cat("The following rows are removed: ", rows_del, ".\n") } mat_HiC <- as.matrix(dat_HiC) vec_HiC <- mat_HiC[upper.tri(mat_HiC, diag=F)] # get the feature matrix and Z-score mat_density <- as.matrix(log(dat_feature[, 3] %o% dat_feature[, 3])) mat_GC <- as.matrix(log(dat_feature[, 4] %o% dat_feature[, 4])) mat_map <- as.matrix(log(dat_feature[, 5] %o% dat_feature[, 5])) mat_density <- (mat_density - mean(c(mat_density))) / sd(c(mat_density)) mat_GC <- (mat_GC - mean(c(mat_GC))) / sd(c(mat_GC)) vec_density <- mat_density[upper.tri(mat_density, diag=F)] vec_GC <- mat_GC[upper.tri(mat_GC, diag=F)] vec_map <- mat_map[upper.tri(mat_map, diag=F)] fit_model <- NULL if (method == "Poisson") { cat("Using Poisson. \n") fit_model <- glm(vec_HiC ~ vec_density + vec_GC + offset(vec_map), family="poisson") } else if (method == "NB") { cat("Using Negative Binomial. \n") fit_model <- glm.nb(vec_HiC ~ vec_density + vec_GC + offset(vec_map)) } else if (method == "ZIP") { cat("Using Zero-inflated Poisson. \n") fit_model <- zeroinfl(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map, dist="poisson") } else if (method == "ZINB") { cat("Using Zero-inflated Negative Binomial. \n") fit_model <- zeroinfl(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map, dist="negbin") } else if (method == "PH") { cat("Using Poisson Hurdle. \n") fit_model <- hurdle(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map) } else if (method == "NBH") { cat("Using Negative Ninomial Hurdle. \n") fit_model <- hurdle(vec_HiC ~ vec_density + vec_GC + offset(vec_map) | vec_density + vec_GC + vec_map, dist="negbin") } else { stop("Please select a valid method.\n", call. = FALSE) } # output normalized matrix fitted_vals <- fit_model$fitted.values fitted_vals_mat <- matrix(0, nrow(mat_HiC), ncol(mat_HiC)) fitted_vals_mat[upper.tri(fitted_vals_mat, diag=FALSE)] = fitted_vals fitted_vals_mat <- t(fitted_vals_mat) fitted_vals_mat[upper.tri(fitted_vals_mat, diag=FALSE)] = fitted_vals res <- round(mat_HiC / fitted_vals_mat, 4) diag(res) <- 0 write.table(res, file=output_file, row.names=F, col.names=F, sep=" ", quote=F) # End