ARPA format
How to generate an arpa file
Given the following text,
a b c d e
d e f a
a b c d e f a
we can convert it to a n-gram LM saved in ARPA format using the script shared/make_kn_lm.py:
cat > test.txt <<EOF
a b c d e
d e f a
a b c d e f a
EOF
wget https://raw.githubusercontent.com/k2-fsa/icefall/master/icefall/shared/make_kn_lm.py
python3 ./make_kn_lm.py -ngram-order 3 -text ./test.txt -lm test.arpa
The content of test.arpa
is given below:
\data\
ngram 1=8
ngram 2=10
ngram 3=9
\1-grams:
-99.0000000 <s> -0.8573325
-0.6989700 a -0.7481880
-1.0000000 b -0.8573325
-1.0000000 c -0.8061800
-0.6989700 d -1.1583625
-1.0000000 e -0.7481880
-0.6989700 </s>
-1.0000000 f -0.8061800
\2-grams:
-0.2041200 <s> a -0.9542425
-0.5351132 <s> d 0.3010300
-0.3590219 a b -0.3010300
-0.3590219 a </s>
-0.0579919 b c -0.3010300
-0.0579919 c d 0.0000000
-0.0280287 d e -0.1760913
-0.3590219 e </s>
-0.3590219 e f -0.3010300
-0.0579919 f a -0.9542425
\3-grams:
-0.0280287 <s> a b
-0.0280287 a b c
-0.0280287 b c d
-0.0280287 c d e
-0.5351132 d e </s>
-0.2041200 d e f
-0.0579919 <s> d e
-0.0280287 e f a
-0.0280287 f a </s>
\end\
How to interpret an arpa file
\data\
ngram 1=8
ngram 2=10
ngram 3=9
An arpa file begins with the literal string \data\
in the first line.
The lines that follow it contain the number of entries for each order:
ngram 1=8
: There are 8 entries for unigram
ngram 2=10
: There are 10 entries for bigram
ngram 3=9
: There are 9 entries for trigram. The highest order of this file is 3.
\1-grams:
-99.0000000 <s> -0.8573325
-0.6989700 a -0.7481880
-1.0000000 b -0.8573325
-1.0000000 c -0.8061800
-0.6989700 d -1.1583625
-1.0000000 e -0.7481880
-0.6989700 </s>
-1.0000000 f -0.8061800
\1-grams:
means the following entries belong to unigram.
Each entry of unigram has 2 or 3 columns.
Column 0: probability in \(\log_{10}\), i.e., \(\log_{10}(p)\)
Column 1: the word
Column 2: back-off probability in \(\log_{10}\), If this column is absent, it is \(\log_{10}(1) = 0\) by default
Caution
How to use score a sentence using arpa
Example 1: p(a | <s>)
\[\log_{10} \mathrm{p}(\mathrm{a}|\lt\!\mathrm{s}\!\gt) = -0.2041200\]
We can read the value of \(\log_{10} \mathrm{p}(\mathrm{a}|\lt\!\mathrm{s}\!\gt)\) directly from the arpa file.
Example 2: p(a b | <s>)
Example 3: p(a b </s> | <s>)
Caution
The arpa file does not contain a b </s>
, so when computing
\(\log_{10}\mathrm{p}(\lt\!/\mathrm{s}\!\gt|\mathrm{a}\;\mathrm{b})\), we use
Similary, b </s>
also does not exist in the arpa file, we use:
Example 4: p(b d </s> | <s>)
Caution
There is no <s> b
in the arpa file, so when computing
\(\log_{10} \mathrm{p}(\mathrm{b}|\lt\!\mathrm{s}\!\gt)\), we use
\[\log_{10} \mathrm{p}(\mathrm{b}|\lt\!\mathrm{s}\!\gt) = \log_{10}p_{\mathrm{backoff}}(\mathrm{\lt\!\mathrm{s}\!\gt}) + \log_{10} \mathrm{p}(\mathrm{b})\]
There is no <s> b d
in the arpa file, so when computing
\(\log_{10} \mathrm{p}(\mathrm{b}|\lt\!\mathrm{s}\!\gt)\), we use
\[\begin{split}\log_{10} \mathrm{p}(\mathrm{d}|\lt\!\mathrm{s}\!\gt \mathrm{b}) &= \log_{10}p_{\mathrm{backoff}}(\mathrm{\lt\!\mathrm{s}\!\gt\mathrm{b}}) + \log_{10} \mathrm{p}(\mathrm{d}|\mathrm{b}) \\ &= 0 + \log_{10} \mathrm{p}(\mathrm{d}|\mathrm{b}) \\ &= \log_{10}p_{\mathrm{backoff}}(\mathrm{b}) +\log_{10} \mathrm{p}(\mathrm{d}) \\\end{split}\]
How to use kenLM to compute scores
First, let us install kenlm
with the following command:
pip install https://github.com/kpu/kenlm/archive/master.zip
1#!/usr/bin/env python3
2
3import kenlm
4
5
6def test():
7 # Note: When we use p(x), we are actually referring to log10(p(x))
8 model = kenlm.LanguageModel("./test.arpa")
9 # model.score() return probability in log10()
10
11 # p("a") in log10
12 assert abs(model.score("a", bos=False, eos=False) - (-0.6989700)) < 1e-5
13
14 # p(a|<s>) in log10
15 assert abs(model.score("a", eos=False) - (-0.2041200)) < 1e-5
16
17 # p(a b | <s>)
18 # = p(a | <s>) + p(b | <s> a)
19 # = (-0.204120) + (-0.0280287)
20 # = -0.23214869
21 # print(model.score("a b", eos=False, bos=True))
22 assert abs(model.score("a b", eos=False, bos=True) - (-0.23214869)) < 1e-5
23
24 # p(a b </s> | <s>)
25 # = p(a | <s>) + p(b | <s> a) + p(</s> | a b)
26 # = (-0.204120) + (-0.0280287) + backoff(a b) + p(</s> | b)
27 # = (-0.204120) + (-0.0280287) + backoff(a b) + p(</s> | b)
28 # = (-0.204120) + (-0.0280287) + (-0.3010300) + backoff(b) + p(</s>)
29 # = (-0.204120) + (-0.0280287) + (-0.3010300) + (-0.8573325) + (-0.6989700)
30 # = -2.0894812
31 # print(model.score("a b", eos=True, bos=True))
32 assert abs(model.score("a b", eos=True, bos=True) - (-2.0894812)) < 1e-5
33 # Pay attention to the computation of p(</s> | a b)
34 # p(</s> | a b)
35 # = backoff (a b) + p (</s> | b)
36 # = backoff (a b) + backoff(b) + p(</s>)
37 #
38 # Also note that p(</s>) is 0
39
40 # p(b d </s> | <s>)
41 # = p(b | <s>) + p (d | <s> b) + p(</s> | b d)
42 # = backoff(<s>) + p(b) + p (d | <s> b) + p(</s> | b d)
43 # = backoff(<s>) + p(b) + backoff(<s> b) + p(d | b) + p(</s> | b d)
44 # = backoff(<s>) + p(b) + backoff(<s> b) + backoff(b) + p(d) + p(</s> | b d)
45 # = backoff(<s>) + p(b) + backoff(<s> b) + backoff(b) + p(d) + backoff(b d) + p(</s> | d)
46 # = backoff(<s>) + p(b) + backoff(<s> b) + backoff(b) + p(d) + backoff(b d) + backoff(d) + p(</s>)
47 # = (-0.8573325) + (-1.0000000) + 0 + (-0.8573325) + (-0.6989700) + 0 + (-1.1583625) + (-0.6989700)
48 # = -5.2709675
49 # print(model.score("b d", eos=True, bos=True))
50 assert abs(model.score("b d", eos=True, bos=True) - (-5.2709675)) < 1e-5
51 print(model.score("b d", eos=True, bos=True))
52 print(list(model.full_scores("b d", eos=True, bos=True)))
53 # Note:
54 # p(b | <s>) = backoff(<s>) + p(b)
55 #
56 # p(d | <s> b) = backoff(<s> b) + p (d | b)
57 # since backoff (<s> b) does not exist, so it is 0
58 #
59 # p(d | b) = backoff(b) + p(d)
60 #
61 # p (</s> | b d) = backoff(b d) + p(</s> | d)
62 # = backoff(b d) + backoff(d) + p(</s>)
63
64
65def test_statefull():
66 model = kenlm.LanguageModel("./test.arpa")
67 s1 = kenlm.State()
68 s2 = kenlm.State()
69 model.BeginSentenceWrite(s1)
70 accum = model.BaseScore(s1, "a", s2) # p(a | <s>)
71 # print(accum) # -0.2041200
72 assert abs(accum - model.score("a", bos=True, eos=False)) < 1e-5
73 accum += model.BaseScore(s2, "b", s1) # p(a | <s>) + p(b | <s> a)
74 # print(accum) # -0.23214869
75 assert abs(accum - model.score("a b", bos=True, eos=False)) < 1e-5
76
77 # reset
78 s1 = kenlm.State()
79 s2 = kenlm.State()
80 model.BeginSentenceWrite(s1)
81 accum = model.BaseScore(s1, "b", s2) # p(b | <s>)
82 # print(accum) # -1.857332
83 assert abs(accum - model.score("b", bos=True, eos=False)) < 1e-5
84 # backoff(<s>) + p(b) = -0.8573325 + (-1) = -1.8573325
85 accum += model.BaseScore(s2, "c", s1) # p(b | <s>) + p(c | b)
86 # print(accum) # -1.91532436
87 # p(b | <s>) + p(c | b) = -1.8573325 + (-0.0579919) = -1.9153244
88 assert abs(accum - model.score("b c", bos=True, eos=False)) < 1e-5
89
90 accum += model.BaseScore(s1, "d", s2) # p(b | <s>) + p(c | b) + p(d | b c)
91 # print(accum)
92 # p(b | <s>) + p(c | b) + p(d | b c) = -1.9153244 + (-0.0280287) = -1.9433531
93 assert abs(accum - model.score("b c d", bos=True, eos=False)) < 1e-5
94
95 # now for oov
96 # reset
97 s1 = kenlm.State()
98 s2 = kenlm.State()
99 model.BeginSentenceWrite(s1)
100 accum = model.BaseScore(s1, "g", s2) # p(g | <s>)
101 # p(g | <s>) = backoff(<s>) + p(<unk>) = -0.8573325 + (-100) = -100.8573325
102 print(accum) # -100.8573325
103 for i in ["a", "b", "c", "d", "e", "f", "</s>"]:
104 assert (
105 abs(model.BaseScore(s2, i, s1) - model.score(i, bos=False, eos=False))
106 < 1e-5
107 )
108 # print(model.BaseScore(s2, "kk", s1)) # -100
109 accum += model.BaseScore(s2, "a", s1) # p(g | <s>) + p(a)
110 print(accum)
111 assert abs(accum - model.score("g a", bos=True, eos=False)) < 1e-5
112 accum += model.BaseScore(s1, "b", s2) # p(g | <s>) + p(a) + p(b|a)
113 print(accum)
114
115
116def main():
117 test()
118 test_statefull()
119
120
121if __name__ == "__main__":
122 main()