Update make_kn_lm.py

Fixed issue #163
This commit is contained in:
huangruizhe 2022-01-02 00:14:27 -08:00 committed by GitHub
parent ea8af0ee9a
commit 49aab7e658
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -165,7 +165,9 @@ class NgramCounts:
n1 += stat[1]
n2 += stat[2]
assert n1 + 2 * n2 > 0
self.d.append(n1 * 1.0 / (n1 + 2 * n2))
self.d.append(max(0.001, n1 * 1.0) / (n1 + 2 * n2)) # We are doing this max(0.001, xxx) to avoid zero discounting constant D,
# which can happen when the number of symbols is small and every w in the vocab
# has been seen after a certain h. This would cause division by zero when computing BOW.
def cal_f(self):
# f(a_z) is a probability distribution of word sequence a_z.