Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(8)

Side by Side Diff: translate/storage/mo.py

Issue 4: Format knowledge in format classes SVN Base: https://translate.svn.sourceforge.net/svnroot/translate/src/trunk/
Patch Set: Second try Created 1 year, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 # 3 #
4 # Copyright 2007 Zuza Software Foundation 4 # Copyright 2007 Zuza Software Foundation
5 # 5 #
6 # the function "__str__" was derived from Python v2.4 6 # the function "__str__" was derived from Python v2.4
7 # (Tools/i18n/msgfmt.py - function "generate"): 7 # (Tools/i18n/msgfmt.py - function "generate"):
8 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de> 8 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
9 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 9 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
10 # All rights reserved. 10 # All rights reserved.
11 # original license: Python Software Foundation (version 2) 11 # original license: Python Software Foundation (version 2)
12 # 12 #
13 # 13 #
14 # This file is part of translate. 14 # This file is part of translate.
15 # 15 #
16 # translate is free software; you can redistribute it and/or modify 16 # translate is free software; you can redistribute it and/or modify
17 # it under the terms of the GNU General Public License as published by 17 # it under the terms of the GNU General Public License as published by
18 # the Free Software Foundation; either version 2 of the License, or 18 # the Free Software Foundation; either version 2 of the License, or
19 # (at your option) any later version. 19 # (at your option) any later version.
20 # 20 #
21 # translate is distributed in the hope that it will be useful, 21 # translate is distributed in the hope that it will be useful,
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of 22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 # GNU General Public License for more details. 24 # GNU General Public License for more details.
25 # 25 #
26 # You should have received a copy of the GNU General Public License 26 # You should have received a copy of the GNU General Public License
27 # along with translate; if not, write to the Free Software 27 # along with translate; if not, write to the Free Software
28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 # 29 #
30 30
31 """Module for parsing Gettext .mo files for translation. 31 """Module for parsing Gettext .mo files for translation.
32 32
33 The coding of .mo files was produced from documentation in Gettext 0.16 and 33 The coding of .mo files was produced from documentation in Gettext 0.16 and
34 from observation and testing of existing .mo files in the wild. 34 from observation and testing of existing .mo files in the wild.
35 35
36 The class does not implement any of the hashing components of Gettext. This 36 The class does not implement any of the hashing components of Gettext. This
37 will probably make the output file slower in some instances. 37 will probably make the output file slower in some instances.
38 """ 38 """
39 39
40 from translate.storage import base 40 from translate.storage import base
41 from translate.storage import po 41 from translate.storage import po
42 from translate.misc.multistring import multistring 42 from translate.misc.multistring import multistring
43 import struct 43 import struct
44 import array 44 import array
45 import re 45 import re
46 46
47 MO_MAGIC_NUMBER = 0x950412deL 47 MO_MAGIC_NUMBER = 0x950412deL
48 48
49 def mounpack(filename='messages.mo'): 49 def mounpack(filename='messages.mo'):
50 """Helper to unpack Gettext MO files into a Python string""" 50 """Helper to unpack Gettext MO files into a Python string"""
51 f = open(filename) 51 f = open(filename)
52 s = f.read() 52 s = f.read()
53 print "\\x%02x"*len(s) % tuple(map(ord, s)) 53 print "\\x%02x"*len(s) % tuple(map(ord, s))
jean.jordaan 2008/07/17 07:33:46 This doesn't actually return a Python string. It l
54 f.close() 54 f.close()
55 55
56 def my_swap4(result): 56 def my_swap4(result):
jean.jordaan 2008/07/17 07:33:46 Docstring?
57 c0 = (result >> 0) & 0xff 57 c0 = (result >> 0) & 0xff
58 c1 = (result >> 8) & 0xff 58 c1 = (result >> 8) & 0xff
59 c2 = (result >> 16) & 0xff 59 c2 = (result >> 16) & 0xff
60 c3 = (result >> 24) & 0xff 60 c3 = (result >> 24) & 0xff
61 61
62 return (c0 << 24) | (c1 << 16) | (c2 << 8) | c3 62 return (c0 << 24) | (c1 << 16) | (c2 << 8) | c3
63 63
64 def hashpjw(str_param): 64 def hashpjw(str_param):
jean.jordaan 2008/07/17 07:33:46 English variable names please?
65 HASHWORDBITS = 32 65 HASHWORDBITS = 32
66 hval = 0 66 hval = 0
67 g = None 67 g = None
68 s = str_param 68 s = str_param
69 for s in str_param: 69 for s in str_param:
70 hval = hval << 4 70 hval = hval << 4
71 hval += ord(s) 71 hval += ord(s)
72 g = hval & 0xf << (HASHWORDBITS - 4) 72 g = hval & 0xf << (HASHWORDBITS - 4)
73 if (g != 0): 73 if (g != 0):
74 hval = hval ^ g >> (HASHWORDBITS - 8) 74 hval = hval ^ g >> (HASHWORDBITS - 8)
75 hval = hval ^ g 75 hval = hval ^ g
76 return hval 76 return hval
77 77
78 78
79 class mounit(base.TranslationUnit): 79 class mounit(base.TranslationUnit):
80 """A class representing a .mo translation message.""" 80 """A class representing a .mo translation message."""
81 def __init__(self, source=None): 81 def __init__(self, source=None):
82 self.msgctxt = [] 82 self.msgctxt = []
83 self.msgidcomments = [] 83 self.msgidcomments = []
84 super(mounit, self).__init__(source) 84 super(mounit, self).__init__(source)
85 85
86 def getcontext(self): 86 def getcontext(self):
87 """Get the message context""" 87 """Get the message context"""
88 # Still need to handle KDE comments 88 # Still need to handle KDE comments
89 if self.msgctxt is None: 89 if self.msgctxt is None:
90 return None 90 return None
91 return "".join(self.msgctxt) 91 return "".join(self.msgctxt)
92 92
93 def isheader(self): 93 def isheader(self):
94 """Is this a header entry?""" 94 """Is this a header entry?"""
95 return self.source == "" 95 return self.source == ""
96 96
97 def istranslatable(self): 97 def istranslatable(self):
98 """Is this message translatable?""" 98 """Is this message translatable?"""
99 return bool(self.source) 99 return bool(self.source)
100 100
101 class mofile(base.TranslationStore): 101 class mofile(base.TranslationStore):
102 """A class representing a .mo file.""" 102 """A class representing a .mo file."""
103 UnitClass = mounit 103 UnitClass = mounit
104 name = "Gettext MO file"
105 extension = ("*.mo", "*.gmo")
106 mimetype = ("application/x-gettext-catalog", )
104 def __init__(self, inputfile=None, unitclass=mounit): 107 def __init__(self, inputfile=None, unitclass=mounit):
jean.jordaan 2008/07/17 07:33:46 Here and elsewhere: "Method definitions inside a c
105 self.UnitClass = unitclass 108 self.UnitClass = unitclass
106 base.TranslationStore.__init__(self, unitclass=unitclass) 109 base.TranslationStore.__init__(self, unitclass=unitclass)
107 self.units = [] 110 self.units = []
108 self.filename = '' 111 self.filename = ''
109 if inputfile is not None: 112 if inputfile is not None:
110 self.parsestring(inputfile) 113 self.parsestring(inputfile)
111 114
112 def __str__(self): 115 def __str__(self):
113 """Output a string representation of the MO data file""" 116 """Output a string representation of the MO data file"""
114 # check the header of this file for the copyright note of this function 117 # check the header of this file for the copyright note of this function
115 def add_to_hash_table(string, i): 118 def add_to_hash_table(string, i):
116 V = hashpjw(string) 119 V = hashpjw(string)
117 S = hash_size <= 2 and 3 or hash_size # Taken from gettext-0.17:gettext-tools/src/write-mo.c:408-409 120 S = hash_size <= 2 and 3 or hash_size # Taken from gettext-0.17:gettext-tools/src/write-mo.c:408-409
118 hash_cursor = V % S; 121 hash_cursor = V % S;
jean.jordaan 2008/07/17 07:33:46 Would it hurt to expand V and S to words?
119 orig_hash_cursor = hash_cursor; 122 orig_hash_cursor = hash_cursor;
120 increment = 1 + (V % (S - 2)); 123 increment = 1 + (V % (S - 2));
121 while True: 124 while True:
122 index = hash_table[hash_cursor] 125 index = hash_table[hash_cursor]
123 if (index == 0): 126 if (index == 0):
124 hash_table[hash_cursor] = i + 1 127 hash_table[hash_cursor] = i + 1
125 break 128 break
126 hash_cursor += increment 129 hash_cursor += increment
127 hash_cursor = hash_cursor % S 130 hash_cursor = hash_cursor % S
128 assert(hash_cursor != orig_hash_cursor) 131 assert(hash_cursor != orig_hash_cursor)
129 132
130 if len(self.units) == 0: 133 if len(self.units) == 0:
131 return '' 134 return ''
132 hash_size = int(len(self.units) * 1.4) 135 hash_size = int(len(self.units) * 1.4)
133 MESSAGES = {} 136 MESSAGES = {}
134 for unit in self.units: 137 for unit in self.units:
135 if isinstance(unit.source, multistring): 138 if isinstance(unit.source, multistring):
136 source = "".join(unit.msgidcomments) + "\0".join(unit.source.strings) 139 source = "".join(unit.msgidcomments) + "\0".join(unit.source.strings)
137 else: 140 else:
138 source = "".join(unit.msgidcomments) + unit.source 141 source = "".join(unit.msgidcomments) + unit.source
139 if unit.msgctxt: 142 if unit.msgctxt:
140 source = "".join(unit.msgctxt) + "\x04" + source 143 source = "".join(unit.msgctxt) + "\x04" + source
141 if isinstance(unit.target, multistring): 144 if isinstance(unit.target, multistring):
142 target = "\0".join(unit.target.strings) 145 target = "\0".join(unit.target.strings)
143 else: 146 else:
144 target = unit.target 147 target = unit.target
145 if unit.target: 148 if unit.target:
146 MESSAGES[source.encode("utf-8")] = target 149 MESSAGES[source.encode("utf-8")] = target
147 hash_table = array.array("L", [0] * hash_size) 150 hash_table = array.array("L", [0] * hash_size)
jean.jordaan 2008/07/17 07:33:46 Hash tables are useful for many things. Could the
148 keys = MESSAGES.keys() 151 keys = MESSAGES.keys()
149 # the keys are sorted in the .mo file 152 # the keys are sorted in the .mo file
150 keys.sort() 153 keys.sort()
151 offsets = [] 154 offsets = []
152 ids = strs = '' 155 ids = strs = ''
153 for i, id in enumerate(keys): 156 for i, id in enumerate(keys):
154 # For each string, we need size and file offset. Each string is NUL 157 # For each string, we need size and file offset. Each string is NUL
155 # terminated; the NUL does not count into the size. 158 # terminated; the NUL does not count into the size.
156 # TODO: We don't do any encoding detection from the PO Header 159 # TODO: We don't do any encoding detection from the PO Header
157 add_to_hash_table(id, i) 160 add_to_hash_table(id, i)
158 string = MESSAGES[id] # id is already encoded for use as a dictionary key 161 string = MESSAGES[id] # id is already encoded for use as a dictionary key
jean.jordaan 2008/07/17 07:33:46 Masks 'string' module; undescriptive in same sense
159 if isinstance(string, unicode): 162 if isinstance(string, unicode):
160 string = string.encode('utf-8') 163 string = string.encode('utf-8')
161 offsets.append((len(ids), len(id), len(strs), len(string))) 164 offsets.append((len(ids), len(id), len(strs), len(string)))
162 ids = ids + id + '\0' 165 ids = ids + id + '\0'
163 strs = strs + string + '\0' 166 strs = strs + string + '\0'
164 output = '' 167 output = ''
165 # The header is 7 32-bit unsigned integers. We don't use hash tables, so 168 # The header is 7 32-bit unsigned integers. We don't use hash tables, so
166 # the keys start right after the index tables. 169 # the keys start right after the index tables.
167 # translated string. 170 # translated string.
168 keystart = 7*4+16*len(keys)+hash_size*4 171 keystart = 7*4+16*len(keys)+hash_size*4
169 # and the values start after the keys 172 # and the values start after the keys
170 valuestart = keystart + len(ids) 173 valuestart = keystart + len(ids)
171 koffsets = [] 174 koffsets = []
172 voffsets = [] 175 voffsets = []
173 # The string table first has the list of keys, then the list of values. 176 # The string table first has the list of keys, then the list of values.
174 # Each entry has first the size of the string, then the file offset. 177 # Each entry has first the size of the string, then the file offset.
175 for o1, l1, o2, l2 in offsets: 178 for o1, l1, o2, l2 in offsets:
176 koffsets = koffsets + [l1, o1+keystart] 179 koffsets = koffsets + [l1, o1+keystart]
177 voffsets = voffsets + [l2, o2+valuestart] 180 voffsets = voffsets + [l2, o2+valuestart]
178 offsets = koffsets + voffsets 181 offsets = koffsets + voffsets
179 output = struct.pack("Iiiiiii", 182 output = struct.pack("Iiiiiii",
180 MO_MAGIC_NUMBER, # Magic 183 MO_MAGIC_NUMBER, # Magic
181 0, # Version 184 0, # Version
182 len(keys), # # of entries 185 len(keys), # # of entries
183 7*4, # start of key index 186 7*4, # start of key index
184 7*4+len(keys)*8, # start of value index 187 7*4+len(keys)*8, # start of value index
185 hash_size, 7*4+2*(len(keys)*8)) # size and offset of hash table 188 hash_size, 7*4+2*(len(keys)*8)) # size and offset of hash table
186 output = output + array.array("i", offsets).tostring() 189 output = output + array.array("i", offsets).tostring()
187 output = output + hash_table.tostring() 190 output = output + hash_table.tostring()
188 output = output + ids 191 output = output + ids
189 output = output + strs 192 output = output + strs
190 return output 193 return output
191 194
192 def parse(self, input): 195 def parse(self, input):
193 """parses the given file or file source string""" 196 """parses the given file or file source string"""
194 if hasattr(input, 'name'): 197 if hasattr(input, 'name'):
195 self.filename = input.name 198 self.filename = input.name
196 elif not getattr(self, 'filename', ''): 199 elif not getattr(self, 'filename', ''):
197 self.filename = '' 200 self.filename = ''
198 if hasattr(input, "read"): 201 if hasattr(input, "read"):
199 mosrc = input.read() 202 mosrc = input.read()
200 input.close() 203 input.close()
201 input = mosrc 204 input = mosrc
202 little, = struct.unpack("<L", input[:4]) 205 little, = struct.unpack("<L", input[:4])
203 big, = struct.unpack(">L", input[:4]) 206 big, = struct.unpack(">L", input[:4])
204 if little == MO_MAGIC_NUMBER: 207 if little == MO_MAGIC_NUMBER:
205 endian = "<" 208 endian = "<"
206 elif big == MO_MAGIC_NUMBER: 209 elif big == MO_MAGIC_NUMBER:
207 endian = ">" 210 endian = ">"
208 else: 211 else:
209 raise ValueError("This is not an MO file") 212 raise ValueError("This is not an MO file")
210 magic, version, lenkeys, startkey, startvalue, sizehash, offsethash = struct.unpack("%sLiiiiii" % endian, input[:(7*4)]) 213 magic, version, lenkeys, startkey, startvalue, sizehash, offsethash = struct.unpack("%sLiiiiii" % endian, input[:(7*4)])
211 if version > 1: 214 if version > 1:
212 raise ValueError("Unable to process MO files with versions > 1. This is a %d version MO file" % version) 215 raise ValueError("Unable to process MO files with versions > 1. This is a %d version MO file" % version)
213 encoding = 'UTF-8' 216 encoding = 'UTF-8'
214 for i in range(lenkeys): 217 for i in range(lenkeys):
215 nextkey = startkey+(i*2*4) 218 nextkey = startkey+(i*2*4)
216 nextvalue = startvalue+(i*2*4) 219 nextvalue = startvalue+(i*2*4)
217 klength, koffset = struct.unpack("%sii" % endian, input[nextkey:nextkey+(2*4)]) 220 klength, koffset = struct.unpack("%sii" % endian, input[nextkey:nextkey+(2*4)])
218 vlength, voffset = struct.unpack("%sii" % endian, input[nextvalue:nextvalue+(2*4)]) 221 vlength, voffset = struct.unpack("%sii" % endian, input[nextvalue:nextvalue+(2*4)])
219 source = input[koffset:koffset+klength] 222 source = input[koffset:koffset+klength]
220 context = None 223 context = None
221 if "\x04" in source: 224 if "\x04" in source:
222 context, source = source.split("\x04") 225 context, source = source.split("\x04")
jean.jordaan 2008/07/17 07:33:46 It looks like you know for certain source will nev
223 # Still need to handle KDE comments 226 # Still need to handle KDE comments
224 source = multistring(source.split("\0"), encoding=encoding) 227 source = multistring(source.split("\0"), encoding=encoding)
225 if source == "": 228 if source == "":
226 charset = re.search("charset=([^\\s]+)", input[voffset:voffset+vlength]) 229 charset = re.search("charset=([^\\s]+)", input[voffset:voffset+vlength])
227 if charset: 230 if charset:
228 encoding = po.encodingToUse(charset.group(1)) 231 encoding = po.encodingToUse(charset.group(1))
229 target = multistring(input[voffset:voffset+vlength].split("\0"), encoding=encoding) 232 target = multistring(input[voffset:voffset+vlength].split("\0"), encoding=encoding)
230 newunit = mounit(source) 233 newunit = mounit(source)
231 newunit.settarget(target) 234 newunit.settarget(target)
232 if context is not None: 235 if context is not None:
233 newunit.msgctxt.append(context) 236 newunit.msgctxt.append(context)
234 self.addunit(newunit) 237 self.addunit(newunit)
OLDNEW

Powered by Google App Engine
This is Rietveld r159