-
Notifications
You must be signed in to change notification settings - Fork 0
/
skim
executable file
·154 lines (123 loc) · 5.05 KB
/
skim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python
"""
skim: A file reader that skips lines efficiently
Copyright (c) 2009, Richard Clark, Red Spider Limited <[email protected]>
Contributors include:
Mark Kirkwood (original python implementation)
Stephen J (Some script boilerplate code adopted from his NZPUG presentation)
See LICENSE file for details.
WARNING: This program is still being developed. The accuracy of the
average line length estimator means you really need a big, relatively consistent
file for it to work properly (ie, a few GB of apache logs). The stream-based
mode (cat file by pipe) is much slower but completely accurate since it has
to read everything anyway instead of seeking.
TODO: If you feel like hacking on the code, here are a few useful things to do:
* Improve the line estimator so that the test suite gets exactly 100 without
hacks.
* Improve the code structure so it looks prettier
* Create a benchmark runner so that we can compare performance to:
* Original python implementation
* C implementations
* wc -l
* cat
* Add a switch to force non-seek mode and hints in the help suggesting that if
you've got short line lengths and only short jumps, there's no benefit to
seek mode
* Add a switch to request random-sized jumps (ie, a 100 line skip would be
random.randint(0, 200)) to improve random distribution.
* Add a switch to request disk-optimised jumps (ie, a 100 line skip would
read say, 4 lines in a row, then skip 400 lines, to reduce the blocks read
from the disk)
* Create packaging script for debian/ubuntu
* Create man page
"""
import os, sys, random, logging
from optparse import OptionParser
# "whence" argument for file.seek(): the offset is relative to the current
# position (the same value as os.SEEK_CUR).
WHENCE_RELATIVE = 1

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger('Skim')


def skim_step_seekable(average_length, lines, file):
    """Jump forward by an estimated byte count and read the next whole line.

    The jump distance is ``average_length * lines`` bytes from the current
    position.  Because the landing point is usually part-way through a line,
    the remainder of that line is discarded and the line after it is the one
    returned.

    Args:
        average_length: estimated length of one line, in bytes.
        lines: number of lines the caller wants to skip.
        file: a seekable file object positioned at the current read point.

    Returns:
        A ``(line, length)`` tuple where ``length`` is the number of bytes
        consumed by ``line`` as measured with ``file.tell()``.  When nothing
        could be read (end of file) the result is ``(None, 0)``.
    """
    # seek() only accepts integers; the estimate may arrive as a float when
    # the caller computed it with true division.
    offset = int(average_length * lines)
    log.debug("Seeking %d forward", offset)
    file.seek(offset, WHENCE_RELATIVE)

    # Probe a single character.  If it is a newline we are now sitting exactly
    # at the start of a line; anything else means we landed inside a line and
    # the rest of it has to be thrown away.  Both the text and the bytes form
    # of the newline are accepted because the file may be opened in binary
    # mode, where read() yields bytes that never compare equal to '\n'.
    probe = file.read(1)
    if probe not in ('\n', b'\n'):
        log.debug("Skipping part-line")
        file.readline()

    # We use file.tell() to determine the length because we're after
    # line lengths in bytes, not characters. Might differ in UTF16 etc.
    start = file.tell()
    log.debug("Starting line read at %d", start)
    line = file.readline()
    log.debug("Read '%s'", line)
    end = file.tell()
    log.debug("Line read finished at %d", end)

    if start == end:
        # The position did not move, so there was nothing left to read.
        log.debug("File end encountered, exiting")
        return (None, 0)
    return (line, end - start)
def skim_step_stream(lines, file):
    """Advance through a stream by reading and discarding whole lines.

    Reads and throws away ``lines`` lines, then reads one more line and
    returns it.  Used when the input cannot be seeked.

    Args:
        lines: number of lines to discard before the one that is returned.
        file: any object with a ``readline()`` method.

    Returns:
        A ``(line, length)`` tuple where ``length`` is ``len(line)``.  At end
        of input the line is empty and the length is 0.
    """
    skipped = 0
    # A counting loop avoids materialising a list of ``lines`` integers
    # (which range() does on Python 2).
    while skipped < lines:
        discarded = file.readline()
        if not discarded:
            # readline() only returns an empty value at end of input (a blank
            # line still carries its newline), so further reads are pointless.
            return (discarded, 0)
        skipped += 1
    line = file.readline()
    return (line, len(line))
def skim_generator(lines, file):
    """Return a generator that yields sampled lines from ``file``.

    If ``file`` accepts a relative seek, sampling is done by jumping ahead by
    an estimated number of bytes (``skim_step_seekable``); the estimate is the
    running average length of the lines yielded so far, starting from an
    assumed 80 bytes.  Otherwise every line is read and the unwanted ones are
    discarded (``skim_step_stream``).

    Args:
        lines: number of lines to skip between yielded lines.
        file: the file object to read from.

    Yields:
        Each sampled line, until a step reports a length of 0.
    """
    # Probe with a zero-length relative seek. If it's a stream, this fails.
    try:
        file.seek(0, WHENCE_RELATIVE)
    except IOError:
        seekable = False
        log.debug("File is not seekable, falling back to reading")
    else:
        seekable = True

    if seekable:
        # Cool seek-jump method
        total_length = 0
        count = 0
        while True:
            # Floor division keeps the estimate an integer, which seek()
            # requires.  Every counted line has length >= 1, so the average
            # is never zero once count is non-zero.
            average = total_length // count if count else 80
            (line, length) = skim_step_seekable(average, lines, file)
            if length == 0:
                # Empty (including no newline). We're done
                return
            count += 1
            total_length += length
            yield line
    else:
        # Sad, old-school readline method
        while True:
            (line, length) = skim_step_stream(lines, file)
            if length == 0:
                # Empty (including no newline). We're done
                return
            yield line
def skim(lines, file):
    """Write the sampled lines of ``file`` to standard output.

    The lines come from ``skim_generator(lines, file)`` and are written
    one at a time, exactly as produced.
    """
    sampled_lines = skim_generator(lines, file)
    for text in sampled_lines:
        sys.stdout.write(text)
if __name__ == "__main__":
    # Command-line entry point: parse options, then skim either the named
    # file or standard input.
    # The filename is optional (standard input is used without it), so the
    # usage text shows it in brackets.
    usage = "usage: %prog [options] [filename]"
    parser = OptionParser(usage=usage)
    parser.add_option("--verbose", "-v",
                      help="print debugging output",
                      action="store_true")
    parser.add_option("--lines", "-l",
                      help="number of lines to skip",
                      type="int",
                      default=100,
                      action="store")
    (options, args) = parser.parse_args()

    # A negative skip would make the seek-based reader jump backwards.
    if options.lines < 0:
        parser.error("--lines must not be negative")

    if options.verbose:
        log.setLevel(logging.DEBUG)
    log.debug("Verbose mode: %s", options.verbose)
    log.debug("Lines to skip: %d", options.lines)

    if args and args[0] != '-':
        # Binary mode so that positions and lengths are counted in bytes.
        source = open(args[0], 'rb')
        try:
            skim(options.lines, source)
        finally:
            source.close()
    else:
        # No filename, or an explicit '-': read from standard input.
        skim(options.lines, sys.stdin)