1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import re
28
29 nb_ngrams = 400
30
31
33
35 if isinstance(arg, basestring):
36 self.addText(arg)
37 self.normalise()
38 elif isinstance(arg, dict):
39 self.ngrams = arg
40 self.normalise()
41 else:
42 self.ngrams = dict()
43
def addText(self, text):
    """Replace this object's n-gram table with counts taken from ``text``.

    The text is split into words on whitespace, each word is padded with
    a leading and trailing ``'_'``, and every substring of length 1 to 4
    of the padded word is counted.

    Note that the table is *replaced*, not extended: whatever was in
    ``self.ngrams`` before the call is discarded.

    :param text: the text to analyse; a byte string is decoded as UTF-8
        first.
    :return: ``self``, so calls can be chained.
    :raises UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this check keeps
    # the Python 2 behaviour while also being correct on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    ngrams = dict()

    # Collapse every run of whitespace (including newlines) to one space so
    # that splitting on ' ' yields the individual words.
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)

    for word in text.split(' '):
        word = '_' + word + '_'
        size = len(word)
        for i in range(size):
            for s in (1, 2, 3, 4):
                sub = word[i:i + s]
                ngrams[sub] = ngrams.get(sub, 0) + 1

                # The slice already reaches the end of the word; longer
                # slices from this start would only repeat it.
                if i + s >= size:
                    break

    self.ngrams = ngrams
    return self
68
75
77 count = 0
78 ngrams = {}
79 for v, k in self.sorted():
80 ngrams[k] = count
81 count += 1
82
83 self.ngrams = ngrams
84 return self
85
87 self.ngrams[key] = value
88 return self
89
91 d = 0
92 ngrams = ngram.ngrams
93 for k in self.ngrams.keys():
94 if ngrams.has_key(k):
95 d += abs(ngrams[k] - self.ngrams[k])
96 else:
97 d += nb_ngrams
98 return d
99
100
101 import os
102 import glob
103
104
106
108 self.ngrams = dict()
109 folder = os.path.join(folder, '*' + ext)
110 size = len(ext)
111 count = 0
112
113 for fname in glob.glob(os.path.normcase(folder)):
114 count += 1
115 lang = os.path.split(fname)[-1][:-size]
116 ngrams = {}
117 lines = open(fname, 'r').readlines()
118
119 try:
120 i = len(lines)
121 for line in lines:
122 line = line.decode('utf-8')
123 parts = line[:-1].split()
124 if len(parts) != 2:
125 try:
126 ngrams[parts[0]] = i
127 except IndexError:
128
129 pass
130 else:
131 ngrams[parts[0]] = int(parts[1])
132 i -= 1
133 except UnicodeDecodeError, e:
134 continue
135
136 if ngrams:
137 self.ngrams[lang] = _NGram(ngrams)
138
139 if not count:
140 raise ValueError("no language files found")
141
159
160
162
def __init__(self, folder, ext='.txt'):
    """Build one n-gram model per ``*ext`` file found in ``folder``.

    The language name of each model is the file name with ``ext``
    stripped. The resulting models are stored in ``self.ngrams``, keyed
    by language name.

    :param folder: directory containing the training text files.
    :param ext: file name suffix selecting the training files.
    :raises UnicodeDecodeError: if a training file is not valid UTF-8.
    """
    self.ngrams = dict()
    pattern = os.path.join(folder, '*' + ext)
    size = len(ext)

    for fname in glob.glob(os.path.normcase(pattern)):
        base = os.path.split(fname)[-1]
        # Slice by explicit length: ``base[:-0]`` would be empty when
        # ``ext`` is ''.
        lang = base[:len(base) - size]
        n = _NGram()

        # Read the whole file in one go. ``addText`` replaces the n-gram
        # table on every call, so feeding it line by line would keep only
        # the n-grams of the last line.
        with open(fname, 'rb') as source:
            text = source.read().decode('utf-8')

        # An empty file contributes no n-grams, as before.
        if text:
            n.addText(text)

        n.normalise()
        self.ngrams[lang] = n
179
def save(self, folder, ext='.lm'):
    """Write every model in ``self.ngrams`` to its own file in ``folder``.

    One file named ``<lang><ext>`` is written per language. Each
    ``(value, key)`` pair yielded by the model's ``sorted()`` method
    becomes one line: the key, a tab and a space, then the integer value.
    Files are written as UTF-8.

    :param folder: existing directory to write the model files into.
    :param ext: file name suffix appended to each language name.
    """
    # Local import so this method does not depend on a module-level import.
    import io

    for lang, model in self.ngrams.items():
        fname = os.path.join(folder, lang + ext)
        # ``with`` closes the file even if a write fails; the explicit
        # encoding keeps non-ASCII n-grams from raising on write.
        with io.open(fname, 'w', encoding='utf-8') as out:
            for v, k in model.sorted():
                if isinstance(k, bytes):
                    k = k.decode('utf-8')
                out.write(u"%s\t %d\n" % (k, v))
187
if __name__ == '__main__':
    import sys

    # Script mode: take a single line from standard input, build an NGram
    # from the 'langmodels' data directory and print what classify()
    # returns for that line.
    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    model_dir = get_abs_data_filename('langmodels')
    classifier = NGram(model_dir)
    print(classifier.classify(text))
199