| OLD | NEW |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
| 3 # | 3 # |
| 4 # Copyright 2007 Zuza Software Foundation | 4 # Copyright 2007 Zuza Software Foundation |
| 5 # | 5 # |
| 6 # the function "__str__" was derived from Python v2.4 | 6 # the function "__str__" was derived from Python v2.4 |
| 7 # (Tools/i18n/msgfmt.py - function "generate"): | 7 # (Tools/i18n/msgfmt.py - function "generate"): |
| 8 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de> | 8 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de> |
| 9 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. | 9 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. |
| 10 # All rights reserved. | 10 # All rights reserved. |
| 11 # original license: Python Software Foundation (version 2) | 11 # original license: Python Software Foundation (version 2) |
| 12 # | 12 # |
| 13 # | 13 # |
| 14 # This file is part of translate. | 14 # This file is part of translate. |
| 15 # | 15 # |
| 16 # translate is free software; you can redistribute it and/or modify | 16 # translate is free software; you can redistribute it and/or modify |
| 17 # it under the terms of the GNU General Public License as published by | 17 # it under the terms of the GNU General Public License as published by |
| 18 # the Free Software Foundation; either version 2 of the License, or | 18 # the Free Software Foundation; either version 2 of the License, or |
| 19 # (at your option) any later version. | 19 # (at your option) any later version. |
| 20 # | 20 # |
| 21 # translate is distributed in the hope that it will be useful, | 21 # translate is distributed in the hope that it will be useful, |
| 22 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 22 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 24 # GNU General Public License for more details. | 24 # GNU General Public License for more details. |
| 25 # | 25 # |
| 26 # You should have received a copy of the GNU General Public License | 26 # You should have received a copy of the GNU General Public License |
| 27 # along with translate; if not, write to the Free Software | 27 # along with translate; if not, write to the Free Software |
| 28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 29 # | 29 # |
| 30 | 30 |
| 31 """Module for parsing Gettext .mo files for translation. | 31 """Module for parsing Gettext .mo files for translation. |
| 32 | 32 |
| 33 The coding of .mo files was produced from documentation in Gettext 0.16 and | 33 The coding of .mo files was produced from documentation in Gettext 0.16 and |
| 34 from observation and testing of existing .mo files in the wild. | 34 from observation and testing of existing .mo files in the wild. |
| 35 | 35 |
| 36 The class does not implement any of the hashing components of Gettext. This | 36 The class does not implement any of the hashing components of Gettext. This |
| 37 will probably make the output file slower in some instances. | 37 will probably make the output file slower in some instances. |
| 38 """ | 38 """ |
| 39 | 39 |
| 40 from translate.storage import base | 40 from translate.storage import base |
| 41 from translate.storage import po | 41 from translate.storage import po |
| 42 from translate.misc.multistring import multistring | 42 from translate.misc.multistring import multistring |
| 43 import struct | 43 import struct |
| 44 import array | 44 import array |
| 45 import re | 45 import re |
| 46 | 46 |
# Magic number expected in the first four bytes of an MO file; parse()
# compares it against both a little- and a big-endian read of those bytes.
# Written without the Python 2 "L" suffix: the value is the same and the
# literal stays valid on interpreters that no longer accept the suffix.
MO_MAGIC_NUMBER = 0x950412de
| 48 | 48 |
def mounpack(filename='messages.mo'):
    """Helper to unpack Gettext MO files into a Python string

    The file is read as raw bytes and every byte is rendered as a
    backslash-x two-digit hex escape, so the result can be pasted into a
    Python string literal.  The escaped text is printed (as before) and is
    now also returned so callers can use it programmatically.

    :param filename: path of the MO file to read
    :return: the escaped text, one escape per byte of the file
    """
    # Binary mode: MO data must not go through newline translation, and the
    # context manager closes the file even if the read fails.
    with open(filename, 'rb') as mo_handle:
        data = mo_handle.read()
    # bytearray yields integers for each byte regardless of the string type
    # that read() returned.
    escaped = "".join(["\\x%02x" % byte for byte in bytearray(data)])
    print(escaped)
    return escaped
| 55 | 55 |
def my_swap4(result):
    """Return ``result`` with its four low-order bytes in reverse order.

    Only the lowest 32 bits of ``result`` take part; the lowest byte ends
    up as the highest byte of the returned value and vice versa.
    """
    swapped = 0
    remaining = result
    for _ in range(4):
        # Peel the current lowest byte off and append it to the output.
        swapped = (swapped << 8) | (remaining & 0xff)
        remaining >>= 8
    return swapped
| 63 | 63 |
def hashpjw(str_param):
    """Compute the string hash used to place keys in the MO hash table.

    Every character shifts the running value left by four bits and adds the
    character code; whenever the top four bits of the 32-bit hash word
    become set they are folded back into the low byte and then cleared.

    :param str_param: text or byte string to hash; items may be
        one-character strings or (as when iterating Python 3 bytes) ints
    :return: the hash value as a non-negative integer (0 for empty input)
    """
    HASHWORDBITS = 32
    top_nibble_mask = 0xf << (HASHWORDBITS - 4)
    hval = 0
    for char in str_param:
        # Byte strings on newer interpreters already iterate as integers.
        if isinstance(char, int):
            code = char
        else:
            code = ord(char)
        hval = (hval << 4) + code
        overflow = hval & top_nibble_mask
        if overflow != 0:
            hval ^= overflow >> (HASHWORDBITS - 8)
            hval ^= overflow
    return hval
| 77 | 77 |
| 78 | 78 |
class mounit(base.TranslationUnit):
    """A single translation message as held in a .mo file."""

    def __init__(self, source=None):
        """Start with empty context and msgid-comment lists, then let the
        base unit handle ``source``."""
        self.msgctxt = []
        self.msgidcomments = []
        super(mounit, self).__init__(source)

    def getcontext(self):
        """Return the message context as one string (None if unset)"""
        # Still need to handle KDE comments
        context_parts = self.msgctxt
        if context_parts is not None:
            return "".join(context_parts)
        return None

    def isheader(self):
        """Tell whether this unit is the header entry (empty source)"""
        is_empty_source = self.source == ""
        return is_empty_source

    def istranslatable(self):
        """Tell whether this unit has a source that can be translated"""
        if self.source:
            return True
        return False
| 100 | 100 |
class mofile(base.TranslationStore):
    """A class representing a .mo file."""
    UnitClass = mounit
    name = "Gettext MO file"
    extension = ("*.mo", "*.gmo")
    mimetype = ("application/x-gettext-catalog", )

    def __init__(self, inputfile=None, unitclass=mounit):
        """Create the store and optionally load ``inputfile``.

        :param inputfile: when not None, handed unchanged to ``parsestring``
        :param unitclass: unit class recorded as ``UnitClass`` and passed on
            to the base store initialiser
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.units = []
        self.filename = ''
        if inputfile is not None:
            # NOTE(review): parsestring is inherited from
            # base.TranslationStore; confirm that it fills this instance
            # rather than building and returning a separate store.
            self.parsestring(inputfile)

    @staticmethod
    def _hash_table_size(entry_count):
        """Return the number of hash-table slots for ``entry_count`` units.

        The result is the smallest prime that is at least 3 and at least
        ``int(entry_count * 1.4)``.  A prime size guarantees that the
        double-hashing probe in ``__str__`` visits every slot, and a size
        above 2 keeps the ``size - 2`` modulus used for the probe step
        positive (compare gettext-0.17:gettext-tools/src/write-mo.c).
        """
        candidate = max(3, int(entry_count * 1.4)) | 1  # odd and >= 3
        while True:
            limit = int(candidate ** 0.5)
            for divisor in range(3, limit + 1, 2):
                if candidate % divisor == 0:
                    break
            else:
                return candidate
            candidate += 2

    def __str__(self):
        """Output a string representation of the MO data file

        The output consists of a header of seven 32-bit integers, the key
        index, the value index, the hash table, and finally the
        NUL-terminated keys followed by the NUL-terminated values.  All
        integers are written as 4 bytes in the machine's byte order.  Units
        without a target are left out; a store without units yields ''.
        """
        # check the header of this file for the copyright note of this function
        if len(self.units) == 0:
            return ''

        messages = {}
        for unit in self.units:
            if isinstance(unit.source, multistring):
                source = "".join(unit.msgidcomments) + "\0".join(unit.source.strings)
            else:
                source = "".join(unit.msgidcomments) + unit.source
            if unit.msgctxt:
                source = "".join(unit.msgctxt) + "\x04" + source
            if isinstance(unit.target, multistring):
                target = "\0".join(unit.target.strings)
            else:
                target = unit.target
            if unit.target:
                messages[source.encode("utf-8")] = target
        # the keys are sorted in the .mo file
        keys = sorted(messages)

        # The table that is allocated and announced in the header must have
        # the same size as the modulus used while probing; otherwise the
        # probe can index past the end of the table.
        hash_size = self._hash_table_size(len(self.units))
        hash_table = [0] * hash_size

        # The header is 7 32-bit unsigned integers.  It is followed by the
        # two index tables (8 bytes per entry each) and the hash table
        # (4 bytes per slot); the keys start right after those,
        keystart = 7*4 + 16*len(keys) + hash_size*4
        # and the values start after the keys (each key is NUL terminated).
        valuestart = keystart + sum([len(msgid) + 1 for msgid in keys])

        key_index = []
        value_index = []
        id_chunks = []
        str_chunks = []
        key_offset = keystart
        value_offset = valuestart
        for index, msgid in enumerate(keys):
            # Double hashing, as in
            # gettext-0.17:gettext-tools/src/write-mo.c: start at
            # hash % size and step by 1 + hash % (size - 2).  Because the
            # size is prime and larger than the number of keys, a free
            # slot is always reached.
            hash_value = hashpjw(msgid)
            cursor = hash_value % hash_size
            step = 1 + (hash_value % (hash_size - 2))
            while hash_table[cursor] != 0:
                cursor = (cursor + step) % hash_size
            hash_table[cursor] = index + 1  # 0 marks an empty slot

            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            # TODO: We don't do any encoding detection from the PO Header
            msgstr = messages[msgid]  # msgid is already encoded for use as a dictionary key
            if isinstance(msgstr, unicode):
                msgstr = msgstr.encode('utf-8')
            # Each index entry has first the size of the string, then the
            # file offset.
            key_index.extend((len(msgid), key_offset))
            value_index.extend((len(msgstr), value_offset))
            key_offset += len(msgid) + 1
            value_offset += len(msgstr) + 1
            id_chunks.append(msgid + '\0')
            str_chunks.append(msgstr + '\0')

        # "=" selects standard sizes (4 bytes per integer) in native byte
        # order, so the written tables always match the offsets computed
        # above, whatever the platform's C integer sizes are.
        output = struct.pack("=Iiiiiii",
                             MO_MAGIC_NUMBER,         # Magic
                             0,                       # Version
                             len(keys),               # # of entries
                             7*4,                     # start of key index
                             7*4+len(keys)*8,         # start of value index
                             hash_size, 7*4+2*(len(keys)*8))  # size and offset of hash table
        offsets = key_index + value_index
        output = output + struct.pack("=%di" % len(offsets), *offsets)
        output = output + struct.pack("=%dI" % hash_size, *hash_table)
        output = output + "".join(id_chunks)
        output = output + "".join(str_chunks)
        return output

    def parse(self, input):
        """parses the given file or file source string

        :param input: MO data as a string, or an object with a ``read``
            method; such an object is read completely and then closed
        :raises ValueError: if the magic number matches in neither byte
            order, or if the file version is greater than 1
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            mosrc = input.read()
            input.close()
            input = mosrc
        # Read the magic number both ways to find the file's byte order.
        little, = struct.unpack("<L", input[:4])
        big, = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")
        magic, version, lenkeys, startkey, startvalue, sizehash, offsethash = struct.unpack("%sLiiiiii" % endian, input[:(7*4)])
        if version > 1:
            raise ValueError("Unable to process MO files with versions > 1.  This is a %d version MO file" % version)
        # Used until a header entry (empty source) announces a charset.
        encoding = 'UTF-8'
        for i in range(lenkeys):
            # Each index entry is two 32-bit integers: length, then offset.
            nextkey = startkey+(i*2*4)
            nextvalue = startvalue+(i*2*4)
            klength, koffset = struct.unpack("%sii" % endian, input[nextkey:nextkey+(2*4)])
            vlength, voffset = struct.unpack("%sii" % endian, input[nextvalue:nextvalue+(2*4)])
            source = input[koffset:koffset+klength]
            value = input[voffset:voffset+vlength]
            context = None
            if "\x04" in source:
                # Only the first separator ends the context; any further
                # \x04 characters stay part of the message id instead of
                # breaking the two-way unpacking.
                context, source = source.split("\x04", 1)
            # Still need to handle KDE comments
            source = multistring(source.split("\0"), encoding=encoding)
            if source == "":
                charset = re.search("charset=([^\\s]+)", value)
                if charset:
                    encoding = po.encodingToUse(charset.group(1))
            target = multistring(value.split("\0"), encoding=encoding)
            newunit = mounit(source)
            newunit.settarget(target)
            if context is not None:
                newunit.msgctxt.append(context)
            self.addunit(newunit)
| OLD | NEW |