-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfit.R
82 lines (65 loc) · 2.87 KB
/
fit.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# This is an R script to estimate k and p parameters for NBD (Negative Binomial Distribution)
# assuming that word frequency distributions follow NBD
# Usage: Rscript myscript.R /path/to/inputfile.txt /path/to/outputfile.txt
# WARNING! Running this script can be compute-intensive operation, use at your own risk.
# No liability of any kind.
# Load the bbmle library
library("bbmle")
# Check if both input and output file names are provided
if (length(commandArgs(trailingOnly = TRUE)) < 2) {
stop("Please provide both input and output file names as command line arguments.")
}
# Get the input and output file names from the command line
input_file <- commandArgs(trailingOnly = TRUE)[1]
output_file <- commandArgs(trailingOnly = TRUE)[2]
# Read word frequencies from a flat text file
file1 <- read.table(file = input_file, head = TRUE, sep = " ", encoding = "UTF-8", fileEncoding = "UTF-8")
# Transpose the data for easier processing
filet <- t(file1)
# Define the negative log-likelihood function for Negative Binomial Distribution
minuslogl <- function(size, prob, x) {
-sum(dnbinom(x = x, size = size, prob = prob, log = TRUE))
}
# Initialize vectors for size, probability, and sqrt_kp, sum, and document frequency (DF)
vsize <- numeric(0)
vprob <- numeric(0)
sqrt_kp <- numeric(0)
sum_vector <- numeric(0)
doc_freq <- numeric(0)
# Iterate through columns of the transposed data
for (i in c(1:nrow(filet))) {
# Extract numeric data from the current column
file <- as.numeric(filet[i, ])
# Calculate the sum for the current column
sum_vector[i] <- sum(file)
# Calculate the document frequency for the current column
# DF (document frequency) is in how many document a word is found
doc_freq[i] <- sum(file != 0)
# Check if there is variability in the data
if (var(file) == 0) {
vsize[i] <- NaN
vprob[i] <- NaN
sqrt_kp[i] <- NaN
} else {
# Fit Negative Binomial Distribution using maximum likelihood estimation
m <- mle2(
minuslogl = minuslogl,
start = list(size = 0.01, prob = 0.01),
data = list(x = file),
method = "L-BFGS-B",
lower = list(size = 1e-4, prob = 1e-4),
upper = list(size = 1e+5, prob = 0.9999)
)
# Store the estimated parameters and calculate sqrt_kp
vsize[i] <- coef(m)[1]
vprob[i] <- coef(m)[2]
sqrt_kp[i] <- sqrt(vsize[i] * vsize[i] + vprob[i] * vprob[i])
}
}
header="word k p sqrt_kp sum DF"
write(header, file = output_file)
# Create a data frame with word, size, probability, and sqrt_kp
# If you want to print actual word frequencies (such as for diagnositcs, change to this: "filet[, ]" )
d <- data.frame(word = filet[, 0], k = vsize, p = vprob, sqrt_kp = sqrt_kp, sum = sum_vector, df = doc_freq)
# Write the results to a tab-separated file with custom column names in the header
write.table(d, file = output_file, sep = "\t", quote = FALSE, row.names = TRUE, col.names = FALSE, append = TRUE)