-
Notifications
You must be signed in to change notification settings - Fork 157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
data: Replace custom formats with msgpack #374
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,8 @@ | |
import os.path | ||
import errno | ||
|
||
import msgpack._cmsgpack | ||
|
||
deflist_regex = re.compile(b'(\d*)(\w)(\d*)(\w),?') | ||
deflist_macro_regex = re.compile('\dM\d+(\w)') | ||
|
||
|
@@ -48,6 +50,36 @@ | |
|
||
defTypeD = {v: k for k, v in defTypeR.items()} | ||
|
||
defTypeToInt = { | ||
'config': 0, | ||
'define': 1, | ||
'enum': 2, | ||
'enumerator': 3, | ||
'function': 4, | ||
'label': 5, | ||
'macro': 6, | ||
'member': 7, | ||
'prototype': 8, | ||
'struct': 9, | ||
'typedef': 10, | ||
'union': 11, | ||
'variable': 12, | ||
'externvar': 13 | ||
} | ||
|
||
intToDefType = {v: k for k, v in defTypeToInt.items()} | ||
|
||
familyToInt = { | ||
'A': 0, | ||
'B': 1, | ||
'C': 2, | ||
'D': 3, | ||
'K': 4, | ||
'M': 5, | ||
} | ||
|
||
intToFamily = {v: k for k, v in familyToInt.items()} | ||
|
||
################################################################################## | ||
|
||
maxId = 999999999 | ||
|
@@ -56,93 +88,97 @@ class DefList: | |
'''Stores associations between a blob ID, a type (e.g., "function"), | ||
a line number and a file family. | ||
Also stores in which families the ident exists for faster tests.''' | ||
def __init__(self, data=b'#'): | ||
self.data, self.families = data.split(b'#') | ||
def __init__(self, data: bytes | None = None): | ||
if data is not None: | ||
parsed_data = msgpack.loads(data) | ||
self.entries = parsed_data[0] | ||
self.families = parsed_data[1] | ||
else: | ||
self.entries = [] | ||
self.families = "" | ||
|
||
def iter(self, dummy=False): | ||
# Get all element in a list of sublists and sort them | ||
entries = deflist_regex.findall(self.data) | ||
entries.sort(key=lambda x:int(x[0])) | ||
for id, type, line, family in entries: | ||
id = int(id) | ||
type = defTypeR [type.decode()] | ||
line = int(line) | ||
family = family.decode() | ||
yield id, type, line, family | ||
# return ((id, defTypeR[type], line, family) for (id, type, line, family) in self.data) | ||
|
||
self.entries.sort(key=lambda x: x[0]) | ||
|
||
for id, type, line, family in self.entries: | ||
yield id, intToDefType[type], line, intToFamily[family] | ||
|
||
if dummy: | ||
yield maxId, None, None, None | ||
|
||
def append(self, id, type, line, family): | ||
def append(self, id: int, type: str, line: int, family: str): | ||
# if family not in self.family: self.family.append(family) | ||
# self.data.append((id, defTypeD[type], line, family)) | ||
|
||
if type not in defTypeD: | ||
return | ||
p = str(id) + defTypeD[type] + str(line) + family | ||
if self.data != b'': | ||
p = ',' + p | ||
self.data += p.encode() | ||
self.add_family(family) | ||
|
||
def pack(self): | ||
return self.data + b'#' + self.families | ||
self.entries.append((id, defTypeToInt[type], line, familyToInt[family])) | ||
|
||
def add_family(self, family): | ||
family = family.encode() | ||
if not family in self.families.split(b','): | ||
if self.families != b'': | ||
family = b',' + family | ||
if family not in self.families: | ||
self.families += family | ||
|
||
def pack(self): | ||
return msgpack.dumps([self.entries, self.families]) | ||
|
||
def get_families(self): | ||
return self.families.decode().split(',') | ||
return self.families | ||
|
||
def get_macros(self): | ||
return deflist_macro_regex.findall(self.data.decode()) or '' | ||
return [intToFamily[family] for _, typ, _, family in self.entries if typ == defTypeToInt['macro']] | ||
|
||
class PathList: | ||
'''Stores associations between a blob ID and a file path. | ||
Inserted by update.py sorted by blob ID.''' | ||
def __init__(self, data=b''): | ||
self.data = data | ||
def __init__(self, data: bytes | None=None): | ||
if data is not None: | ||
# [(id, path)] | ||
self.data = msgpack.loads(data) | ||
else: | ||
self.data = [] | ||
|
||
def iter(self, dummy=False): | ||
for p in self.data.split(b'\n')[:-1]: | ||
id, path = p.split(b' ',maxsplit=1) | ||
id = int(id) | ||
path = path.decode() | ||
for id, path in self.data: | ||
yield id, path | ||
if dummy: | ||
yield maxId, None | ||
|
||
def append(self, id, path): | ||
p = str(id).encode() + b' ' + path + b'\n' | ||
self.data += p | ||
def append(self, id: int, path: str): | ||
self.data.append((id, path)) | ||
|
||
def pack(self): | ||
return self.data | ||
return msgpack.dumps(self.data) | ||
|
||
class RefList: | ||
'''Stores a mapping from blob ID to list of lines | ||
and the corresponding family.''' | ||
def __init__(self, data=b''): | ||
self.data = data | ||
def __init__(self, data=None): | ||
# {(blob_id, family): [line]} | ||
if data is not None: | ||
self.data = msgpack.loads(data, strict_map_key=False) | ||
else: | ||
self.data = {} | ||
|
||
def iter(self, dummy=False): | ||
# Split all elements in a list of sublists and sort them | ||
entries = [x.split(b':') for x in self.data.split(b'\n')[:-1]] | ||
entries.sort(key=lambda x:int(x[0])) | ||
for b, c, d in entries: | ||
b = int(b.decode()) | ||
c = c.decode() | ||
d = d.decode() | ||
yield b, c, d | ||
for id, family_dict in self.data.items(): | ||
for family, lines in family_dict.items(): | ||
yield id, lines, family | ||
if dummy: | ||
yield maxId, None, None | ||
|
||
def append(self, id, lines, family): | ||
p = str(id) + ':' + lines + ':' + family + '\n' | ||
self.data += p.encode() | ||
if id not in self.data: | ||
self.data[id] = {} | ||
if family not in self.data[id]: | ||
self.data[id][family] = [] | ||
|
||
self.data[id][family] += lines | ||
|
||
def pack(self): | ||
return self.data | ||
return msgpack.dumps(self.data) | ||
|
||
class BsdDB: | ||
def __init__(self, filename, readonly, contentType, shared=False): | ||
|
@@ -158,24 +194,43 @@ def __init__(self, filename, readonly, contentType, shared=False): | |
self.db.open(filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) | ||
self.ctype = contentType | ||
|
||
def exists(self, key): | ||
key = autoBytes(key) | ||
def exists(self, key: str|bytes|int): | ||
if type(key) is str: | ||
key = key.encode() | ||
elif type(key) is int: | ||
key = msgpack.dumps(key) | ||
|
||
return self.db.exists(key) | ||
|
||
def get(self, key): | ||
key = autoBytes(key) | ||
def get(self, key: str|bytes|int): | ||
if type(key) is str: | ||
key = key.encode() | ||
elif type(key) is int: | ||
key = msgpack.dumps(key) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we want to do this? Isn't it rather an error if someone gives us a string or int? A key is of type bytes; callers that don't respect that have a bug, IMO. |
||
|
||
p = self.db.get(key) | ||
p = self.ctype(p) | ||
return p | ||
if p is not None: | ||
if self.ctype is None: | ||
return msgpack.loads(p) | ||
else: | ||
return self.ctype(p) | ||
else: | ||
return None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Early return the `None` case. |
||
|
||
def get_keys(self): | ||
return self.db.keys() | ||
|
||
def put(self, key, val, sync=False): | ||
key = autoBytes(key) | ||
val = autoBytes(val) | ||
if type(val) is not bytes: | ||
def put(self, key: str|bytes|int, val, sync=False): | ||
if type(key) is str: | ||
key = key.encode() | ||
elif type(key) is int: | ||
key = msgpack.dumps(key) | ||
|
||
if self.ctype is None: | ||
val = msgpack.dumps(val) | ||
else: | ||
val = val.pack() | ||
|
||
self.db.put(key, val) | ||
if sync: | ||
self.db.sync() | ||
|
@@ -192,13 +247,13 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False): | |
|
||
ro = readonly | ||
|
||
self.vars = BsdDB(dir + '/variables.db', ro, lambda x: int(x.decode()), shared=shared) | ||
self.vars = BsdDB(dir + '/variables.db', ro, shared=shared) | ||
# Key-value store of basic information | ||
self.blob = BsdDB(dir + '/blobs.db', ro, lambda x: int(x.decode()), shared=shared) | ||
self.blob = BsdDB(dir + '/blobs.db', ro, shared=shared) | ||
# Map hash to sequential integer serial number | ||
self.hash = BsdDB(dir + '/hashes.db', ro, lambda x: x, shared=shared) | ||
self.hash = BsdDB(dir + '/hashes.db', ro, shared=shared) | ||
# Map serial number back to hash | ||
self.file = BsdDB(dir + '/filenames.db', ro, lambda x: x.decode(), shared=shared) | ||
self.file = BsdDB(dir + '/filenames.db', ro, shared=shared) | ||
# Map serial number to filename | ||
self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared) | ||
self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
import sys | ||
import logging | ||
import subprocess, os | ||
import msgpack._cmsgpack | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -189,9 +190,9 @@ def isIdent(bstr): | |
|
||
def autoBytes(arg): | ||
if type(arg) is str: | ||
arg = arg.encode() | ||
arg = msgpack.dumps(arg) | ||
elif type(arg) is int: | ||
arg = str(arg).encode() | ||
arg = msgpack.dumps(arg) | ||
return arg | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment: is this used? Shouldn't callers know what they have and do the right thing themselves? |
||
|
||
def getDataDir(): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,7 @@ function generateSymbolDefinitionsHTML(symbolDefinitions, project, version) { | |
result += '<ul>'; | ||
previous_type = sd.type; | ||
} | ||
let ln = sd.line.toString().split(','); | ||
let ln = [sd.line]; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is that related to msgpack? |
||
if (ln.length == 1) { | ||
let n = ln[0]; | ||
result += `<li><a href="/${project}/${version}/source/${sd.path}#L${n}"><strong>${sd.path}</strong>, line ${n} <em>(as a ${sd.type})</em></a>`; | ||
|
@@ -87,7 +87,7 @@ function generateSymbolReferencesHTML(symbolReferences, project, version) { | |
result += '<h2>Referenced in ' + symbolReferences.length.toString() + ' files:</h2>'; | ||
result += '<ul>'; | ||
for (let sr of symbolReferences) { | ||
let ln = sr.line.split(','); | ||
let ln = sr.line; | ||
if (ln.length == 1) { | ||
let n = ln[0]; | ||
result += `<li><a href="/${project}/${version}/source/${sr.path}#L${n}"><strong>${sr.path}</strong>, line ${n}</a>`; | ||
|
@@ -117,7 +117,7 @@ function generateDocCommentsHTML(symbolDocComments, project, version) { | |
result += '<h2>Documented in ' + symbolDocComments.length.toString() + ' files:</h2>'; | ||
result += '<ul>'; | ||
for(let sd of symbolDocComments) { | ||
let ln = sd.line.split(','); | ||
let ln = sd.line; | ||
if(ln.length == 1) { | ||
let n = ln[0]; | ||
result += `<li><a href="/${project}/${version}/source/${sd.path}#L${n}"><strong>${sd.path}</strong>, line ${n}</a>`; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we work only on the raw data, and parse it only when things are requested? The goal is to store just a bytes buffer, so that we don't use loads of memory if we want to keep loads of these objects in memory.