-
Notifications
You must be signed in to change notification settings - Fork 3
/
storage.py
601 lines (550 loc) · 17.4 KB
/
storage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
#
# Our storage layer.
# We have one or more storage pools. Storage pools are used to get objects,
# which are named by Unix-format paths relative to the root of the storage
# area.
# Storage objects tell us four pieces of information: does this object
# exist, is it displayable, what is its contents (if it is displayable)
# and what is its history (and if history is available).
# Storage pools return None on get()s if something bad is going on.
# Otherwise they return objects, even for things that don't exist.
# Storage retrieval is case-sensitive.
from __future__ import with_statement
import rcsfile
import os, os.path, pwd, stat
import hashlib
import time, cPickle, tempfile
import derrors, utils
#
# It is worth optimizing stat() calls.
def fillStat(fname):
try:
return os.lstat(fname)
except EnvironmentError:
return None
def stat_isdir(st):
return st and stat.S_ISDIR(st.st_mode)
def stat_islink(st):
return st and stat.S_ISLNK(st.st_mode)
def join2(a, b):
return a + os.path.sep + b
# Implicitly, a plain file object is not displayable if it has no contents
# at all.
sBad, sInconsist, sGood, sLocked, sNoRCS = range(5)
noStat = object()
class FileObj(object):
type = "file"
def __init__(self, fname, st = noStat, omode = "r"):
# fname is fully resolvable
if st is noStat:
st = fillStat(fname)
self.real = bool(st)
self.name = fname
self.fstore = None
self._state = None
self._st = st
self._omode = omode
# self._state insures we only try to fill once.
def fill(self):
if not self.real or self._state != None:
return
try:
with open(self.name, self._omode) as fp:
self.fstore = fp.read()
self._state = sGood
except EnvironmentError:
self._state = sBad
# We are displayable if we have contents.
# We exist if we are real.
def displayable(self):
self.fill()
return bool(self.fstore)
def exists(self):
return self.real
def inconsistent(self):
return False
def contents(self):
self.fill()
return self.fstore
# Non-VCS files have no history.
def history(self):
return None
def hashistory(self):
return False
# But everyone has modification times.
# (This remains valid for RCS files too, because we take the
# content from the raw file not the RCS file, so we take the
# timestamp from there too.)
def timestamp(self):
if self.real:
return self._st.st_mtime
else:
return None
# This is a crappy name for the ctime.
def modstamp(self):
if self.real:
return self._st.st_ctime
else:
return None
# The owner of a normal file is a pretty simple idea in
# theory, but kind of annoying to implement in practice.
# FIXME: pwd.getpwuid() is a hot path for Atom feed generation
# and should be cacheable somehow.
def owner(self):
if not self.real:
return None
st = self._st
try:
pwe = pwd.getpwuid(st.st_uid)
return pwe.pw_name
except KeyError:
return "#%d" % st.st_uid
# A file's identity is its (st_dev, st_ino) combination.
# This should be treated as an opaque identifier by all
# callers.
# Note that this is not persistent over the long term;
# it is strictly useful to detect hardlinked files in
# the short term (one processing pass).
def identity(self):
if not self.real:
return None
st = self._st
return (st.st_dev, st.st_ino)
# We might as well expose these directly on the page,
# instead of forcing people into peculiar fishing expeditions
# through the page store. Among other things, this means that
# we don't have to tediously revalidate the path.
def is_link(self):
if not self.real:
return False
else:
return stat_islink(self._st)
def linkval(self):
if not self.is_link():
return None
return os.readlink(self.name)
# RCS file objects treat the non-RCS file as a cache for the real
# file, which is in RCS. They only exist if both the cache file
# and the real file exist. (This is debateable, but I prefer it
# this way -- you can pull the checked out version to invalidate
# a file.)
# As a bonus, if there is no RCS file the file is still real.
# This turns out to be way what I want in practice.
class RCSFileObj(FileObj):
def __init__(self, fname, rname, st = None):
FileObj.__init__(self, fname, st)
self.rname = rname
self.rfo = None
self._state = None
def fill(self):
if not self.real or self._state != None:
return
# At this point, self.rfo and self.fstore should be
# empty.
self.rfo = rcsfile.rcsfile(self.rname)
try:
with open(self.name, "r") as fp:
self.fstore = fp.read()
except EnvironmentError:
pass
# Now we check consistency.
# First: real file must exist:
if not self.fstore:
self._state = sBad
elif not self.rfo:
self._state = sNoRCS
elif self.rfo.headlocked():
self._state = sLocked
# If head is unlocked, the on-disk file must be the
# same as the RCS file.
elif not rcsfile.consistent(self.rfo, self.fstore):
self._state = sInconsist
# Mark us as non-displayable:
self.fstore = None
else:
self._state = sGood
def inconsistent(self):
return self._state == sInconsist
def history(self):
if not self.displayable() or self._state == sNoRCS:
return None
# We want revision history in newest to oldest.
revs = self.rfo.fullrevs()
revs.reverse()
return revs
def hashistory(self):
return self.displayable() and self._state != sNoRCS
def isdirty(self):
if self._state == sInconsist:
return True
elif self._state in (sBad, sGood, sNoRCS):
return False
else:
assert(self._state == sLocked)
return not rcsfile.consistent(self.rfo, self.fstore)
def current_user(self):
if not self.hashistory():
return None
return self.rfo.wholocked(self.rfo.head())
def histtimestamp(self):
if self._state not in (sBad, sNoRCS):
return os.path.getmtime(self.rname)
else:
return None
# Ownership is fun! For the whole family!
# If we have RCS, our owner is either the current locker or
# the person who checked in the head version.
def owner(self):
if not self.hashistory():
return super(RCSFileObj, self).owner()
hver = self.rfo.head()
lck = self.rfo.wholocked(hver)
if lck:
return lck
return self.rfo.revinfo(hver)[0]
# Directories have as contents a list of their constituent bits,
# minus things that start with a dot and minus directories called
# RCS.
class DirObj(FileObj):
type = "dir"
def fill(self):
if self._state != None:
return
try:
self.fstore = [x for x in os.listdir(self.name)
if utils.good_path_elem(x)]
self.fstore.sort()
self._state = sGood
except EnvironmentError:
self._state = sBad
# We must override this because we want an empty directory to
# be good, not bad.
def displayable(self):
self.fill()
return self.fstore is not None
# This gives us the root directory for data and whether or not we have
# a separate directory for version control stuff.
# We make calls to a StoragePool to retrieve files from it, which
# gives us FileObjs, which we then mangle in various ways.
class StoragePool:
def __init__(self, cfginfo):
self.root = cfginfo['dirroot']
self.cache = {}
self.stcache = {}
self.cache_on = True
# To be valid, something must be a good path and every piece
# of it must be a good name.
def validname(self, relname):
if relname == '':
return True
return utils.goodpath(relname)
def get(self, relname, missIsNone = False):
# We don't need to revalidate the damn name if it's
# already in our cache.
if relname in self.cache:
return self.cache[relname]
if not self.validname(relname):
return None
bn = join2(self.root, relname)
st = self.getStat(bn)
# It is more convenient to some callers if we can
# return a None when no page exists than if we go
# to the work of returning a real file page object.
if not st and missIsNone:
return None
elif stat_isdir(st):
res = self.fromdir(relname, bn, st)
else:
res = self.fromfile(relname, bn, st)
if self.cache_on:
self.cache[relname] = res
return res
def flush(self):
self.cache.clear()
self.stcache.clear()
# When the cache is off, nothing new is added to it.
def set_cache(self, state):
self.cache_on = state
# INTERNAL INTERFACE: takes a full path name, not a relative
# name. This is dangerous.
def getStat(self, fname):
if fname in self.stcache:
return self.stcache[fname]
st = fillStat(fname)
if self.cache_on:
self.stcache[fname] = st
return st
# Get the type of a name as a string (or None if it doesn't exist).
# The name is *relative*.
def get_type(self, relname):
fname = join2(self.root, relname)
st = self.getStat(fname)
if not st:
return None
elif stat_isdir(st):
return "dir"
else:
return "file"
def fromdir(self, relname, fname, st):
__pychecker__ = "no-argsused"
return DirObj(fname, st)
def fromfile(self, relname, fname, st):
__pychecker__ = "no-argsused"
return FileObj(fname, st)
# This returns a list of (modtime, filename) pairs in
# no particular order. (You get to sort them if you
# want to.)
# The files may or may not be valid by the time you
# get around to looking at them.
# We list files, not directories, because directories
# are just an organizational structure.
def children(self, path):
if not utils.goodpath(path):
# With an iterator, this yields de nada
return
if path == '':
spath = self.root
else:
spath = join2(self.root, path)
pdirs = [spath,]
rlen = len(self.root)+1
while pdirs:
wd = pdirs.pop()
try:
names = os.listdir(wd)
except EnvironmentError:
continue
for name in names:
if not utils.good_path_elem(name):
continue
cn = join2(wd, name)
st = os.lstat(cn)
self.stcache[cn] = st
if stat.S_ISDIR(st.st_mode):
pdirs.append(cn)
elif stat.S_ISREG(st.st_mode):
cn = cn[rlen:]
# cn is guaranteed to be non-null.
# we can only get here if it's a
# regular file, and if it's a
# regular file it must be inside
# the root, which is a directory.
yield (st.st_mtime, cn)
# and we're done
# Note that unlike other checks, this checks only that the
# path is good not that the names in the path are good.
# It can thus be used to probe for the existence of
# names that cannot be shown.
def exists(self, relname):
# It is worth fast-casing this, because it avoids
# an unnecessary bogusname check.
jn = join2(self.root, relname)
if jn in self.stcache:
return bool(self.stcache[jn])
if utils.boguspath(relname):
return False
else:
st = self.getStat(jn)
return bool(st)
def linkval(self, path):
if not utils.goodpath(path):
return None
fpath = join2(self.root, path)
st = self.getStat(fpath)
if not stat_islink(st):
return None
return os.readlink(fpath)
# There are two variants of RCS storage pools.
# If given a 'rcsroot', RCS files are found in a parallel hierarchy
# (with no RCS/ directories, just with ,v on the end).
# Otherwise they are found in RCS/ directories in the regular root.
class RCSStoragePool(StoragePool):
def __init__(self, cfginfo):
StoragePool.__init__(self, cfginfo)
if 'rcsdir' in cfginfo:
self.rroot = cfginfo['rcsdir']
else:
self.rroot = None
def fromfile(self, relname, fname, st):
if self.rroot:
rname = join2(self.rroot, rcsfile.rcsbasename(relname))
else:
rname = join2(self.root, rcsfile.rcsdirname(relname))
#fname = join2(self.root, relname)
return RCSFileObj(fname, rname, st)
# Make a directory in the face of races.
def makedirs(name):
head, tail = os.path.split(name)
if not tail:
head, tail = os.path.split(head)
if head and tail and not os.path.exists(head):
makedirs(head)
try:
os.mkdir(name)
except EnvironmentError:
if not os.path.isdir(name):
raise
#
# Store comments in a read/write pool.
class CommentStoragePool(StoragePool):
def blobname(self, blob):
h = hashlib.sha1()
h.update(blob)
return h.hexdigest()
# The biggest and most complex operation is writing something
# new to the store.
# New blobs are stored in directories, which we make as necessary.
# They are named by the SHA1 of their contents, in hex form.
# FIXME: come up with a better encoding scheme.
# FIXME: come up with some better way of returning errors.
def newblob(self, where, blobstr):
if not self.validname(where):
raise derrors.IntErr("bad commentstore name: '%s'" % where)
loc = join2(self.root, where)
try:
if not os.path.isdir(loc):
makedirs(loc)
except EnvironmentError as e:
raise derrors.IOErr("could not make directory '%s': %s" % (loc, str(e)))
objname = self.blobname(blobstr)
pname = join2(loc, objname)
phase = "create"
try:
fd = os.open(pname, os.O_CREAT|os.O_EXCL|os.O_WRONLY,
0644)
phase = "write"
os.write(fd, blobstr)
os.fsync(fd)
# neurotic carefulness requires us to fsync the
# directory too.
try:
fd2 = os.open(loc, os.O_RDONLY)
os.fsync(fd2)
os.close(fd2)
except EnvironmentError:
pass
phase = "close"
os.close(fd)
except EnvironmentError as e:
# It exists already, so we 'succeeded' as it were.
if phase == "create" and os.path.exists(pname):
return True
if phase == "write":
os.close(fd)
raise derrors.IOErr("could not %s file '%s': %s" % (phase, pname, str(e)))
return True
#
# Cache storage is also a read/write pool. Like comments, the interface
# is deliberately restricted.
#
# The cache is addressed by a zone / host / path / key quad. Zone and key
# are single path elements; path is a valid regular path. To make sure
# we can never collide with valid sub-paths, the keys always have a
# '~' appended to the end internally.
#
# Cache cleaning is explicitly not addressed by this cache storage
# module. The author considers unlink() dangerous; the rename() that
# the store code does is already bad enough. Clean the cache outside
# of dwiki with your favorite mechanism.
class CacheStoragePool(StoragePool):
def validate_quad(self, zone, host, path, key):
if not self.validname(path) or \
'/' in key or '/' in zone or '/' in host or \
not utils.good_path_elem(host) or \
not utils.good_path_elem(zone) or \
not utils.good_path_elem(key):
raise derrors.CacheKeyErr("bad zone / host / path / key in cachestore: '%s' '%s' '%s' '%s'" % (zone, host, path, key))
def get(self, relname, missIsNone = False):
__pychecker__ = "no-argsused"
raise derrors.IntErr("using get on a cachestore")
def fetch(self, zone, host, path, key = "default", TTL = None):
self.validate_quad(zone, host, path, key)
key = key + '~'
fname = os.path.sep.join([self.root, zone, host, path, key])
st = self.getStat(fname)
now = time.time()
if not st:
return None
elif not stat.S_ISREG(st.st_mode):
raise derrors.IntErr("not a regular file in cachestore: '%s' '%s' '%s' '%s'" % (zone, host, path, key))
elif TTL and (st.st_mtime + TTL) < now:
return None
# Finally we can get and retrieve the cached file.
# Cached file objects are always depickled before
# getting returned. (We use FileObjs because they
# handle a number of details for us.)
#
# Rather than cache the depickled objects, which may
# be mutated by later people, we cache the file object
# data and depickle each time.
if fname not in self.cache:
fo = FileObj(fname, st, "rb")
data = fo.contents()
else:
data = self.cache[fname]
if not data:
# This might be: empty file, no-permissions file,
# and probably others. None are severe enough to
# kill us.
return None
ro = cPickle.loads(data)
if self.cache_on:
self.cache[fname] = data
return ro
#
# Storage is complicated by our desire to be atomic, or
# something close to it, so we don't overwrite the file
# in place but instead make a temporary one and then rename
# to the target name.
def store(self, value, zone, host, path, key = "default"):
self.validate_quad(zone, host, path, key)
key = key + '~'
dname = os.path.sep.join([self.root, zone, host, path])
fname = join2(dname, key)
# Serialize data. We tell dumps to use the best format
# it has, since we don't care about our caches being
# read by an older version.
data = cPickle.dumps(value, -1)
# Make the directory, which is zone + path.
try:
if not os.path.isdir(dname):
makedirs(dname)
except EnvironmentError as e:
return "could not make directory '%s': %s" % (dname, str(e))
# We write the data by a) making a tempfile,
# b) writing the data to it, c) renaming the
# tempfile to the real name.
# Arguably failing to make the cache file should not
# be a fatal error.
try:
stage = "making tempfile"
(fd, tname) = tempfile.mkstemp("XXXX~", ".store",
dname)
stage = "writing data"
os.write(fd, data)
os.close(fd)
# Personal twitch:
stage = "fixing permissions"
os.chmod(tname, 0644)
stage = "renaming to final name"
os.rename(tname, fname)
return None
except EnvironmentError as e:
return "error while %s for %s/%s/%s: %s" % (stage, zone, path, key, str(e))
# Return the timestamp of a cache entry or None if there is no such
# cache entry. This enables flag file based cache invalidation, or
# more exactly flag-quad based invalidation; store to the quad to
# update its time, then check the relatively time of the flag quad
# and your cache data.
def timeof(self, zone, host, path, key = "default"):
self.validate_quad(zone, host, path, key)
key = key + '~'
dname = os.path.sep.join([self.root, zone, host, path])
fname = join2(dname, key)
st = self.getStat(fname)
if not st:
return None
elif not stat.S_ISREG(st.st_mode):
raise derrors.IntErr("not a regular file in cachestore: '%s' '%s' '%s' '%s'" % (zone, host, path, key))
else:
return st.st_mtime