look - display lines beginning with a given string
DESCRIPTION
The look utility displays any lines in file which contain string as a prefix. As look performs a binary search, the lines in file must be sorted (where sort(1) was given the same options -d and/or -f that look is invoked with).
example:
justin@box:~/data$ time look $(echo -n secret123 | sha1sum | cut -d ' ' -f 1 | tr a-z A-Z) pwned-passwords-sha1-ordered-by-hash-v6.txt
F2B14F68EB995FACB3A1C35287B778D5BD785511:17384
real 0m0.212s
user 0m0.005s
sys 0m0.001s
justin@box:~/data$ time look $(echo -n secret123 | sha1sum | cut -d ' ' -f 1 | tr a-z A-Z) pwned-passwords-sha1-ordered-by-hash-v6.txt
F2B14F68EB995FACB3A1C35287B778D5BD785511:17384
real 0m0.002s
user 0m0.003s
sys 0m0.001s
You can make a Python binary search super fast if you use mmap. Here's a version of that I had lying around; it's probably correct.
import os
import mmap
def do_mmap(f):
    """Memory-map the file at path *f* read-only.

    Returns a ``(map, size, fd)`` tuple: the ``mmap`` object, the file size in
    bytes, and the OS-level file descriptor the map was created from.  The
    caller owns both the map and the descriptor and must close each of them.

    Raises ``OSError`` if the file cannot be opened or sized, and
    ``ValueError`` if it is empty (an empty file cannot be mapped).  On any
    failure the descriptor is closed before the exception propagates.
    """
    fd = os.open(f, os.O_RDONLY)
    try:
        # Seeking to the end reports the size; rewind afterwards so the
        # descriptor is handed back positioned at the start of the file.
        size = os.lseek(fd, 0, os.SEEK_END)
        os.lseek(fd, 0, os.SEEK_SET)
        m = mmap.mmap(fd, size, prot=mmap.PROT_READ)
    except Exception:
        # Without this the descriptor would leak whenever mapping fails.
        os.close(fd)
        raise
    return m, size, fd
# ``whence`` values in the conventional numbering: 0 means "from the start of
# the file", 1 means "from the current position".  Not referenced by the code
# visible in this section.
SEEK_SET, SEEK_CUR = 0, 1
class Searcher:
    """Prefix search over a sorted, newline-delimited file via a memory map.

    The file's lines must be sorted bytewise by content.  Queries are
    ``bytes`` prefixes; matching lines are returned exactly as stored,
    including their line terminator.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`.
    """

    def __init__(self, file):
        """Map *file* (a filesystem path) into memory for searching."""
        self.file = file
        self.map, self.size, self.fd = do_mmap(file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def close(self):
        """Release the memory map and the underlying file descriptor."""
        try:
            self.map.close()
        finally:
            # Close the descriptor even if unmapping raised.
            os.close(self.fd)

    def find_newline(self):
        """Advance the map's cursor past the current line; return the new offset."""
        self.map.readline()
        return self.map.tell()

    def _line_at(self, offset):
        """Return ``(line, next_offset)`` for the line starting at *offset*.

        Does not touch the map's cursor.  The final line of the file may lack
        a trailing newline, in which case it extends to the end of the map.
        """
        newline = self.map.find(b"\n", offset)
        stop = self.size if newline == -1 else newline + 1
        return self.map[offset:stop], stop

    def binary_search(self, q):
        """Yield every line that starts with the ``bytes`` prefix *q*.

        A binary search over byte offsets finds the start of the first line
        whose leading ``len(q)`` bytes are not less than *q*; lines are then
        yielded from there for as long as they keep matching.  Nothing is
        yielded when no line matches.  An empty *q* yields every line.
        """
        width = len(q)
        # Invariant: ``lo`` is always the start of a line, and the first line
        # that is >= q starts somewhere in [lo, hi].
        lo, hi = 0, self.size
        while lo < hi:
            mid = lo + (hi - lo) // 2
            # Snap the probe back to the start of the line containing ``mid``
            # so that every line, including the very first, can be compared.
            line_start = self.map.rfind(b"\n", 0, mid) + 1
            line, next_start = self._line_at(line_start)
            # Compare on content only: strip the terminator and truncate to
            # the prefix width, which keeps the ordering monotonic.
            if line.rstrip(b"\r\n")[:width] < q:
                lo = next_start
            else:
                hi = line_start

        # Track the offset locally rather than relying on the map's cursor,
        # and stop at end-of-file so an empty prefix cannot loop forever.
        offset = lo
        while offset < self.size:
            line, offset = self._line_at(offset)
            if not line.startswith(q):
                break
            yield line
if __name__ == "__main__":
    # Usage: script PREFIX [FILE]
    # Prints every line of FILE that starts with the upper-cased PREFIX,
    # followed by the elapsed search-and-print time in seconds.
    import sys
    import time

    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} PREFIX [FILE]")
    q = sys.argv[1]
    # The data file defaults to the name this script was written against.
    path = (
        sys.argv[2]
        if len(sys.argv) > 2
        else "pwned-passwords-sha1-ordered-by-hash-v6.txt"
    )
    s = Searcher(path)
    try:
        ss = time.perf_counter()
        # The query is upper-cased before searching.
        res = s.binary_search(q.upper().encode())
        for x in res:
            print(x)
        ee = time.perf_counter()
        print(ee - ss)
    finally:
        # Release the map and descriptor even if the search fails.
        s.close()
If you have 10 computers, put every 10th line in its own file. If each file is 1000 lines, put line 500 at the start, then line 250, then line 750, then lines 125, 375, etc.
What would be the fastest way using *nix commands? A naive solution would be something like: