Package translate :: Package lang :: Module data
[hide private]
[frames | no frames]

Source Code for Module translate.lang.data

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2007-2009 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module stores information and functionality that relates to plurals.""" 
 23   
 24  import unicodedata 
 25   
 26  from translate.storage.placeables import StringElem 
 27   
 28   
 29  languages = { 
 30  'af': (u'Afrikaans', 2, '(n != 1)'), 
 31  'ak': (u'Akan', 2, 'n > 1'), 
 32  'am': (u'Amharic', 2, 'n > 1'), 
 33  'an': (u'Aragonese', 2, '(n != 1)'), 
 34  'ar': (u'Arabic', 6, 'n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 ? 4 : 5'), 
 35  'arn': (u'Mapudungun; Mapuche', 2, 'n > 1'), 
 36  'ast': (u'Asturian; Bable; Leonese; Asturleonese', 2, '(n != 1)'), 
 37  'az': (u'Azerbaijani', 2, '(n != 1)'), 
 38  'be': (u'Belarusian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 39  'bg': (u'Bulgarian', 2, '(n != 1)'), 
 40  'bn': (u'Bengali', 2, '(n != 1)'), 
 41  'bn_IN': (u'Bengali (India)', 2, '(n != 1)'), 
 42  'bo': (u'Tibetan', 1, '0'), 
 43  'br': (u'Breton', 2, 'n > 1'), 
 44  'bs': (u'Bosnian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 45  'ca': (u'Catalan; Valencian', 2, '(n != 1)'), 
 46  'ca@valencia': (u'Catalan; Valencian (Valencia)', 2, '(n != 1)'), 
 47  'cs': (u'Czech', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
 48  'csb': (u'Kashubian', 3, 'n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 49  'cy': (u'Welsh', 2, '(n==2) ? 1 : 0'), 
 50  'da': (u'Danish', 2, '(n != 1)'), 
 51  'de': (u'German', 2, '(n != 1)'), 
 52  'dz': (u'Dzongkha', 1, '0'), 
 53  'el': (u'Greek, Modern (1453-)', 2, '(n != 1)'), 
 54  'en': (u'English', 2, '(n != 1)'), 
 55  'en_GB': (u'English (United Kingdom)', 2, '(n != 1)'), 
 56  'en_ZA': (u'English (South Africa)', 2, '(n != 1)'), 
 57  'eo': (u'Esperanto', 2, '(n != 1)'), 
 58  'es': (u'Spanish; Castilian', 2, '(n != 1)'), 
 59  'et': (u'Estonian', 2, '(n != 1)'), 
 60  'eu': (u'Basque', 2, '(n != 1)'), 
 61  'fa': (u'Persian', 1, '0'), 
 62  'fi': (u'Finnish', 2, '(n != 1)'), 
 63  'fil': (u'Filipino; Pilipino', 2, '(n > 1)'), 
 64  'fo': (u'Faroese', 2, '(n != 1)'), 
 65  'fr': (u'French', 2, '(n > 1)'), 
 66  'fur': (u'Friulian', 2, '(n != 1)'), 
 67  'fy': (u'Frisian', 2, '(n != 1)'), 
 68  'ga': (u'Irish', 3, 'n==1 ? 0 : n==2 ? 1 : 2'), 
 69  'gl': (u'Galician', 2, '(n != 1)'), 
 70  'gu': (u'Gujarati', 2, '(n != 1)'), 
 71  'gun': (u'Gun', 2, '(n > 1)'), 
 72  'ha': (u'Hausa', 2, '(n != 1)'), 
 73  'he': (u'Hebrew', 2, '(n != 1)'), 
 74  'hi': (u'Hindi', 2, '(n != 1)'), 
 75  'hy': (u'Armenian', 1, '0'), 
 76  'hr': (u'Croatian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 77  'hu': (u'Hungarian', 2, '(n != 1)'), 
 78  'id': (u'Indonesian', 1, '0'), 
 79  'is': (u'Icelandic', 2, '(n != 1)'), 
 80  'it': (u'Italian', 2, '(n != 1)'), 
 81  'ja': (u'Japanese', 1, '0'), 
 82  'jv': (u'Javanese', 2, '(n != 1)'), 
 83  'ka': (u'Georgian', 1, '0'), 
 84  'kk': (u'Kazakh', 1, '0'), 
 85  'km': (u'Central Khmer', 1, '0'), 
 86  'kn': (u'Kannada', 2, '(n != 1)'), 
 87  'ko': (u'Korean', 1, '0'), 
 88  'ku': (u'Kurdish', 2, '(n != 1)'), 
 89  'kw': (u'Cornish', 4, '(n==1) ? 0 : (n==2) ? 1 : (n == 3) ? 2 : 3'), 
 90  'ky': (u'Kirghiz; Kyrgyz', 1, '0'), 
 91  'lb': (u'Luxembourgish; Letzeburgesch', 2, '(n != 1)'), 
 92  'ln': (u'Lingala', 2, '(n > 1)'), 
 93  'lo': (u'Lao', 1, '0'), 
 94  'lt': (u'Lithuanian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 95  'lv': (u'Latvian', 3, '(n%10==1 && n%100!=11 ? 0 : n != 0 ? 1 : 2)'), 
 96  'mg': (u'Malagasy', 2, '(n > 1)'), 
 97  'mi': (u'Maori', 2, '(n > 1)'), 
 98  'mk': (u'Macedonian', 2, 'n==1 || n%10==1 ? 0 : 1'), 
 99  'ml': (u'Malayalam', 2, '(n != 1)'), 
100  'mn': (u'Mongolian', 2, '(n != 1)'), 
101  'mr': (u'Marathi', 2, '(n != 1)'), 
102  'ms': (u'Malay', 1, '0'), 
103  'mt': (u'Maltese', 4, '(n==1 ? 0 : n==0 || ( n%100>1 && n%100<11) ? 1 : (n%100>10 && n%100<20 ) ? 2 : 3)'), 
104  'nah': (u'Nahuatl languages', 2, '(n != 1)'), 
105  'nap': (u'Neapolitan', 2, '(n != 1)'), 
106  'nb': (u'Bokmål, Norwegian; Norwegian Bokmål', 2, '(n != 1)'), 
107  'ne': (u'Nepali', 2, '(n != 1)'), 
108  'nl': (u'Dutch; Flemish', 2, '(n != 1)'), 
109  'nn': (u'Norwegian Nynorsk; Nynorsk, Norwegian', 2, '(n != 1)'), 
110  'nso': (u'Pedi; Sepedi; Northern Sotho', 2, '(n > 1)'), 
111  'oc': (u'Occitan (post 1500)', 2, '(n > 1)'), 
112  'or': (u'Oriya', 2, '(n != 1)'), 
113  'pa': (u'Panjabi; Punjabi', 2, '(n != 1)'), 
114  'pap': (u'Papiamento', 2, '(n != 1)'), 
115  'pl': (u'Polish', 3, '(n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
116  'pms': (u'Piemontese', 2, '(n != 1)'), 
117  'ps': (u'Pushto; Pashto', 2, '(n != 1)'), 
118  'pt': (u'Portuguese', 2, '(n != 1)'), 
119  'pt_BR': (u'Portuguese (Brazil)', 2, '(n > 1)'), 
120  'rm': (u'Romansh', 2, '(n != 1)'), 
121  'ro': (u'Romanian', 3, '(n==1 ? 0 : (n==0 || (n%100 > 0 && n%100 < 20)) ? 1 : 2);'), 
122  'ru': (u'Russian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
123  'sco': (u'Scots', 2, '(n != 1)'), 
124  'si': (u'Sinhala; Sinhalese', 2, '(n != 1)'), 
125  'sk': (u'Slovak', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
126  'sl': (u'Slovenian', 4, '(n%100==1 ? 0 : n%100==2 ? 1 : n%100==3 || n%100==4 ? 2 : 3)'), 
127  'so': (u'Somali', 2, '(n != 1)'), 
128  'sq': (u'Albanian', 2, '(n != 1)'), 
129  'sr': (u'Serbian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
130  'st': (u'Sotho, Southern', 2, '(n != 1)'), 
131  'su': (u'Sundanese', 1, '0'), 
132  'sv': (u'Swedish', 2, '(n != 1)'), 
133  'sw': (u'Swahili', 2, '(n != 1)'), 
134  'ta': (u'Tamil', 2, '(n != 1)'), 
135  'te': (u'Telugu', 2, '(n != 1)'), 
136  'tg': (u'Tajik', 2, '(n != 1)'), 
137  'ti': (u'Tigrinya', 2, '(n > 1)'), 
138  'th': (u'Thai', 1, '0'), 
139  'tk': (u'Turkmen', 2, '(n != 1)'), 
140  'tr': (u'Turkish', 1, '0'), 
141  'tt': (u'Tatar', 1, '0'), 
142  'ug': (u'Uighur; Uyghur', 1, '0'), 
143  'uk': (u'Ukrainian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
144  'vi': (u'Vietnamese', 1, '0'), 
145  'wa': (u'Walloon', 2, '(n > 1)'), 
146  # Chinese is difficult because the main divide is on script, not really  
147  # country. Simplified Chinese is used mostly in China, Singapore and Malaysia. 
148  # Traditional Chinese is used mostly in Hong Kong, Taiwan and Macau. 
149  'zh_CN': (u'Chinese (China)', 1, '0'), 
150  'zh_HK': (u'Chinese (Hong Kong)', 1, '0'), 
151  'zh_TW': (u'Chinese (Taiwan)', 1, '0'), 
152  'zu': (u'Zulu', 2, '(n != 1)'), 
153  } 
154  """Dictionary of language data. 
155  The language code is the dictionary key (which may contain country codes and modifiers). 
156  The value is a tuple: (Full name in English from iso-codes, nplurals, plural equation). 
157   
158  Note that the English names should not be used in user facing places - it 
159  should always be passed through the function returned from tr_lang(), or at 
160  least passed through _fix_language_name().""" 
161   
162  _fixed_names = { 
163          u"Asturian; Bable; Leonese; Asturleonese": u"Asturian", 
164          u"Bokmål, Norwegian; Norwegian Bokmål": u"Norwegian Bokmål", 
165          u"Catalan; Valencian": u"Catalan", 
166          u"Central Khmer": u"Khmer", 
167          u"Chichewa; Chewa; Nyanja": u"Chewa; Nyanja", 
168          u"Divehi; Dhivehi; Maldivian": u"Divehi", 
169          u"Dutch; Flemish": u"Dutch", 
170          u"Filipino; Pilipino": u"Filipino", 
171          u"Greek, Modern (1453-)": u"Greek", 
172          u"Kirghiz; Kyrgyz": u"Kirghiz", 
173          u"Klingon; tlhIngan-Hol": u"Klingon", 
174          u"Limburgan; Limburger; Limburgish": u"Limburgish", 
175          u"Low German; Low Saxon; German, Low; Saxon, Low": u"Low German", 
176          u"Luxembourgish; Letzeburgesch": u"Luxembourgish", 
177          u"Ndebele, South; South Ndebele": u"Southern Ndebele", 
178          u"Norwegian Nynorsk; Nynorsk, Norwegian": u"Norwegian Nynorsk", 
179          u"Occitan (post 1500)": u"Occitan", 
180          u"Panjabi; Punjabi": u"Punjabi", 
181          u"Pedi; Sepedi; Northern Sotho": u"Northern Sotho", 
182          u"Pushto; Pashto": u"Pashto", 
183          u"Sinhala; Sinhalese": u"Sinhala", 
184          u"Sotho, Southern": u"Sotho", 
185          u"Spanish; Castilian": u"Spanish", 
186          u"Uighur; Uyghur": u"Uighur", 
187  } 
188   
def simplercode(code):
    """Return a more general form of the given language code by dropping its
    last component (a country code or modifier, for example).

    A false value (C{None} or the empty string) is returned unchanged. A code
    with no separator left to strip yields the empty string.

    @see:
      - U{http://www.rfc-editor.org/rfc/bcp/bcp47.txt}
      - U{http://www.rfc-editor.org/rfc/rfc4646.txt}
      - U{http://www.rfc-editor.org/rfc/rfc4647.txt}
      - U{http://www.w3.org/International/articles/language-tags/}
    """
    if not code:
        return code

    # normalize_code() turns "_" and "@" into "-", so one search on the
    # normalized form locates the right-most separator of any kind.
    cut = normalize_code(code).rfind('-')
    if cut < 0:
        return ""
    # Slice the original string so that its casing is preserved.
    return code[:cut]
expansion_factors = {
    'af': 0.1,
    'ar': -0.09,
    'es': 0.21,
    'fr': 0.28,
    'it': 0.2,
}
"""Source to target string length expansion factors."""

import gettext
import locale
import re
import os

# Filled lazily by gettext_lang(): language code -> gettext function.
iso639 = {}
"""ISO 639 language codes"""
# Filled lazily by gettext_country(): language code -> gettext function.
iso3166 = {}
"""ISO 3166 country codes"""

# A full code: 2-3 lowercase letters, then an optional "_XX"/"-XX" part and an
# optional "@modifier" part.
langcode_re = re.compile("^[a-z]{2,3}([_-][A-Z]{2,3}|)(@[a-zA-Z0-9]+|)$")
# Only the part that may follow the language: "_XX"/"-XX" plus optional "@modifier".
variant_re = re.compile("^[_-][A-Z]{2,3}(@[a-zA-Z0-9]+|)$")
def languagematch(languagecode, otherlanguagecode):
    """Tell whether otherlanguagecode matches languagecode, ignoring any
    region/variant suffix on otherlanguagecode.

    If languagecode is C{None}, otherlanguagecode only has to look like a
    language code. The result is meant to be used for its truth value: it may
    be a boolean, a regular expression match object or C{None}.
    """
    if languagecode is None:
        return langcode_re.match(otherlanguagecode)
    if languagecode == otherlanguagecode:
        return True
    if not otherlanguagecode.startswith(languagecode):
        return False
    # Whatever follows the shared prefix must be a well-formed variant.
    remainder = otherlanguagecode[len(languagecode):]
    return variant_re.match(remainder)
# Splits a name of the form "language (country)" into its two parts. The
# parenthesised part may not contain digits, so names such as
# "Occitan (post 1500)" are not treated as dialects.
dialect_name_re = re.compile(r"(.+)\s\(([^)\d]+)\)$")
def tr_lang(langcode=None):
    """Build a function that translates a language name into the language
    with ISO code langcode (the system language if none is given).

    The returned function also accepts names of the form
    C{"language (country)"}; the language and the country are then translated
    separately and put back together in the same form."""
    translate_language = gettext_lang(langcode)
    translate_country = gettext_country(langcode)

    def handlelanguage(name):
        dialect = dialect_name_re.match(name)
        if not dialect:
            return _fix_language_name(translate_language(name))
        language, country = dialect.groups()
        translated_language = _fix_language_name(translate_language(language))
        return u"%s (%s)" % (translated_language, translate_country(country))

    return handlelanguage
def _fix_language_name(name):
    """Tidy up some of the unsightly names found in iso-codes.

    A name listed in _fixed_names is assumed to be untranslated and is
    replaced by the friendlier rendering stored there. Any other name that is
    long and contains a semi-colon is cut off just before that semi-colon to
    keep things neat. Everything else is returned unchanged."""
    if name in _fixed_names:
        return _fixed_names[name]
    # These constants (11 and 5) are somewhat arbitrary, but testing with the
    # Japanese translation of ISO codes suggests these as the upper bounds.
    if len(name) <= 11:
        return name
    # Ignore a semi-colon within the first five characters.
    semicolon = name.find(u';', 5)
    if semicolon < 0:
        return name
    return name[:semicolon]
273 274
def gettext_lang(langcode=None):
    """Returns a gettext function to translate language names into the given
    language, or the system language if no language is specified.

    The function is created once per language code and then served from the
    iso639 cache; the system-language function is cached under the key "".

    @param langcode: Language code to translate into; a false value (such as
        the default C{None}) selects the system language
    @return: The C{ugettext} method of the 'iso_639' translation
    """
    # Map every "no language" value onto the key the entry is stored under
    # BEFORE consulting the cache; otherwise the default None never matches
    # and the translation is rebuilt on every call.
    if not langcode:
        langcode = ""
    if langcode not in iso639:
        if langcode:
            t = gettext.translation('iso_639', languages=[langcode], fallback=True)
        elif os.name == "nt":
            # On Windows the default locale is not used for some reason
            t = gettext.translation('iso_639', languages=[locale.getdefaultlocale()[0]], fallback=True)
        else:
            t = gettext.translation('iso_639', fallback=True)
        iso639[langcode] = t.ugettext
    return iso639[langcode]
290
def gettext_country(langcode=None):
    """Returns a gettext function to translate country names into the given
    language, or the system language if no language is specified.

    The function is created once per language code and then served from the
    iso3166 cache; the system-language function is cached under the key "".

    @param langcode: Language code to translate into; a false value (such as
        the default C{None}) selects the system language
    @return: The C{ugettext} method of the 'iso_3166' translation
    """
    # Map every "no language" value onto the key the entry is stored under
    # BEFORE consulting the cache; otherwise the default None never matches
    # and the translation is rebuilt on every call.
    if not langcode:
        langcode = ""
    if langcode not in iso3166:
        if langcode:
            t = gettext.translation('iso_3166', languages=[langcode], fallback=True)
        elif os.name == "nt":
            # On Windows the default locale is not used for some reason
            t = gettext.translation('iso_3166', languages=[locale.getdefaultlocale()[0]], fallback=True)
        else:
            t = gettext.translation('iso_3166', fallback=True)
        iso3166[langcode] = t.ugettext
    return iso3166[langcode]
306
def normalize(string, normal_form="NFC"):
    """Bring a unicode string into the requested Unicode normal form.

    @param string: The string to be normalized; C{None} is passed through
    @param normal_form: NFC (default), NFD, NFKC, NFKD
    @return: Normalized string, or C{None} if string was C{None}
    """
    if string is not None:
        return unicodedata.normalize(normal_form, string)
    return None
318
def forceunicode(string):
    """Make sure the given text is a unicode string.

    @param string: A text string
    @type string: Unicode, String
    @return: The text as Unicode; C{None} and values of any other type are
        returned unchanged.
    @rtype: Unicode
    """
    if string is None:
        return None
    if isinstance(string, str):
        # Byte strings are decoded with their own "encoding" attribute when
        # they carry one, and as UTF-8 otherwise.
        return string.decode(getattr(string, "encoding", "utf-8"))
    if isinstance(string, StringElem):
        return unicode(string)
    return string
335
def normalized_unicode(string):
    """Convert the string with forceunicode() and then apply normalize()
    with its default normal form."""
    text = forceunicode(string)
    return normalize(text)
339
def normalize_code(code):
    """Return the language code in lower case with "-" as its only separator.

    Both "_" and "@" are replaced by "-". A false value (C{None} or the empty
    string) is returned unchanged.
    """
    if code:
        normalized = code
        for separator in ("_", "@"):
            normalized = normalized.replace(separator, "-")
        return normalized.lower()
    return code
344
def simplify_to_common(language_code, languages=languages):
    """Simplify language code to the most commonly used form for the
    language, stripping country information for languages that tend
    not to be localized differently for different countries.

    The code is shortened with simplercode() until it matches a key of
    languages (compared through normalize_code()) or cannot be shortened any
    further; the code reached at that point is returned.

    @param language_code: The language code to simplify; a false value
        (C{None} or the empty string) is returned unchanged
    @param languages: Dictionary whose keys are the known language codes
    @return: The simplified language code
    """
    # Normalize the known codes once instead of on every simplification step.
    known_codes = set(normalize_code(key) for key in languages)
    while True:
        if normalize_code(language_code) in known_codes:
            return language_code
        simpler = simplercode(language_code)
        # simplercode() gives "" when nothing is left to strip and hands back
        # None for None, so test for any false value rather than only "".
        if not simpler:
            return language_code
        # Keep checking against the caller's languages, not the module default.
        language_code = simpler
354