#!/usr/bin/python3 """ A python native parser with (many) missing features. Only support the bare minimum to extract symbols addresses and field offsets in structures Written from info found here: https://llvm.org/docs/PDB/index.html """ from math import ceil from struct import unpack from functools import cache, cached_property from uuid import UUID try: from line_profiler_pycharm import profile except ImportError: profile = lambda x: x def u32f(f, addr=None): if addr is not None: f.seek(addr) return unpack("= self.size: raise StopIteration return SectionHeaderStream.SectionHeader(self.read(40)) def __getitem__(self, section_index): if section_index >= self.NumberOfSections: raise ValueError(f"Section number {section_index} does not exist") self.cursor = section_index * 40 return SectionHeaderStream.SectionHeader(self.read(40)) class TPIorIPStream(MsfStream): """ struct TpiStreamHeader { uint32_t Version; uint32_t HeaderSize; uint32_t TypeIndexBegin; uint32_t TypeIndexEnd; uint32_t TypeRecordBytes; uint16_t HashStreamIndex; uint16_t HashAuxStreamIndex; uint32_t HashKeySize; uint32_t NumHashBuckets; int32_t HashValueBufferOffset; uint32_t HashValueBufferLength; int32_t IndexOffsetBufferOffset; uint32_t IndexOffsetBufferLength; int32_t HashAdjBufferOffset; uint32_t HashAdjBufferLength; }; """ REC_TYPES = { 0x1001: "LF_MODIFIER", 0x1002: "LF_POINTER", 0x1008: "LF_PROCEDURE", 0x1201: "LF_ARGLIST", 0x1203: "LF_FIELDLIST", 0x1205: "LF_BITFIELD", 0x1404: "LF_INDEX", 0x1502: "LF_ENUMERATE", 0x1503: "LF_ARRAY", 0x1505: "LF_STRUCTURE", 0x1506: "LF_UNION", 0x1507: "LF_ENUM", 0x150D: "LF_MEMBER", 0x1605: "LF_STRING_ID", 0x1606: "LF_UDT_SRC_LINE", } def __init__(self, msf, size, blocks): MsfStream.__init__(self, msf, size, blocks) self.filter = None self.type_index = self.TypeIndexBegin self.types = dict() self.REC_TYPES_ids = {self.REC_TYPES[k]: k for k in self.REC_TYPES} self.types_parsed = False @cached_property def HeaderSize(self): return self.u32(4) @cached_property def TypeIndexBegin(self): return self.u32(8) @cached_property def TypeRecordBytes(self): return self.u32(16) def skip_padding(self): b = self.u8() self.cursor -= 1 if b in (0xF1, 0xF2, 0xF3): padding_size = b & 0xF # assert b"\xF3\xF2\xF1".endswith(self.read(padding_size)) self.cursor += padding_size def unsigned(self): leaf = self.u16() if leaf < 0x8000: return leaf match leaf: case 0x8000: # LF_CHAR return self.u8() case 0x8002: # LF_SHORT return self.u16() case 0x8003 | 0x8004: # LF_LONG |LF_ULONG return self.u32() case 0x800A: # LF_SHORT return self.u64() case _: raise ValueError def __iter__(self): self.type_index = self.TypeIndexBegin self.cursor = self.HeaderSize return self def __next__(self): leaf_entry = None while leaf_entry is None: if self.cursor == self.size: self.types_parsed = True raise StopIteration if self.size - self.cursor < 4: raise ValueError record_length = self.u16() record_end = self.cursor + record_length if self.size < record_end: raise ValueError if self.filter is not None and self.peek_u16() not in self.filter: self.cursor = record_end self.type_index += 1 continue leaf_entry = self.parse_one_leaf_entry(record_end) self.types[self.type_index] = leaf_entry self.type_index += 1 if self.cursor > record_end: raise ValueError if self.cursor < record_end: end = self.read(record_end - self.cursor) if not b"\xf3\xf2\xf1".endswith(end): raise ValueError(f"Unparsed data: {end} for record {leaf_entry}") return leaf_entry def parse_one_leaf_entry(self, record_end): record_type = self.u16() if record_type not in self.REC_TYPES: raise ValueError(f"Record {hex(record_type)} not handled") match self.REC_TYPES.get(record_type, "???"): case "LF_MODIFIER": utype = self.u32() modifier = self.u16() record = (utype, modifier) case "LF_POINTER": utype = self.u32() attr = self.u32() if ((attr >> 5) & 7) in (2, 3): # ptrmode == Member or MemberFunction raise ValueError record = (utype, attr) case "LF_STRUCTURE": count = self.u16() properties = self.u16() has_unique_name = (properties & 0x200) != 0 fields = self.u32() derived_from = self.u32() vtable_shape = self.u32() size = self.unsigned() name = self.cstring() unique_name = self.cstring() if has_unique_name else None record = ( count, properties, fields, derived_from, vtable_shape, size, name, ) case "LF_FIELDLIST": fields = list() continuation = None while self.cursor < record_end: next_field = self.u16() if self.REC_TYPES[next_field] == "LF_INDEX": continuation = self.u32() else: self.cursor -= 2 fields.append(self.parse_one_leaf_entry(record_end)) self.skip_padding() record = (fields, continuation) case "LF_MEMBER": attributes = self.u16() field_type = self.u32() offset = self.unsigned() name = self.cstring() record = (attributes, field_type, offset, name) case "LF_ARGLIST": count = self.u32() arglist = [self.u32() for _ in range(count)] record = arglist case "LF_PROCEDURE": return_type = self.u32() attributes = self.u16() parameter_count = self.u16() argument_list = self.u32() record = (return_type, attributes, parameter_count, argument_list) case "LF_ARRAY": element_type = self.u32() indexing_type = self.u32() size = self.unsigned() pad = self.cstring() assert pad == b"" record = (element_type, indexing_type, size) case "LF_UNION": count = self.u16() properties = self.u16() has_unique_name = (properties & 0x200) != 0 fields = self.u32() size = self.unsigned() name = self.cstring() unique_name = self.cstring() if has_unique_name else None record = ( count, properties, fields, size, name, ) case "LF_ENUMERATE": attributes = self.u16() value = self.unsigned() name = self.cstring() record = (attributes, value, name) case "LF_ENUM": count = self.u16() properties = self.u16() has_unique_name = (properties & 0x200) != 0 underlying_type = self.u32() fields = self.u32() name = self.cstring() unique_name = self.cstring() if has_unique_name else None record = ( count, properties, underlying_type, fields, name, ) case "LF_BITFIELD": underlying_type = self.u32() length = self.u8() position = self.u8() record = (underlying_type, length, position) case _: record = () raise ValueError( f"Record {hex(record_type)} / {self.REC_TYPES.get(record_type, '???')} : not implemented" ) return self.REC_TYPES[record_type], record import io class Msf(object): def __init__(self, path=None, content=None): if content is not None: self.f = f = io.BytesIO(content) else: with open(path, "rb") as f_ondisk: self.f = f = io.BytesIO(f_ondisk.read()) FileMagic = f.read(32) assert FileMagic == b"Microsoft C/C++ MSF 7.00\r\n" + bytes.fromhex("1A 44 53 00 00 00") self.BlockSize = blockSize = u32f(f) self.FreeBlockMapBlock = u32f(f) self.NumBlocks = u32f(f) self.NumDirectoryBytes = u32f(f) self.Unknown = u32f(f) self.BlockMapAddr = u32f(f) self.StreamDirectory = MsfStreamDirectory(self) def __del__(self): self.f.close() @cache def Stream(self, stream_number): return MsfStream( self, self.StreamDirectory.StreamSize(stream_number), self.StreamDirectory.StreamBlocks(stream_number), ) class Pdb(Msf): @cached_property def PDBStream(self): return PdbInfoStream( self, self.StreamDirectory.StreamSize(1), self.StreamDirectory.StreamBlocks(1), ) @cached_property def DBIStream(self): return DBIStream( self, self.StreamDirectory.StreamSize(3), self.StreamDirectory.StreamBlocks(3), ) @cached_property def TPIStream(self): return TPIorIPStream( self, self.StreamDirectory.StreamSize(2), self.StreamDirectory.StreamBlocks(2), ) @cached_property def IPIStream(self): return TPIorIPStream( self, self.StreamDirectory.StreamSize(4), self.StreamDirectory.StreamBlocks(4), ) def get_field_offset(self, structname, fieldname): tpistream = self.TPIStream if not tpistream.types_parsed: save_filter = tpistream.filter tpistream.filter = [ tpistream.REC_TYPES_ids["LF_FIELDLIST"], tpistream.REC_TYPES_ids["LF_STRUCTURE"], ] for _ in tpistream: pass tpistream.filter = save_filter structname = structname.encode() for struct_id, t in tpistream.types.items(): if t[0] == "LF_STRUCTURE": if t[1][2] != 0 and t[1][6] == structname: break else: raise ValueError(f"Structure {structname} not found in PDB") fieldlist_id = t[1][2] fieldlist = tpistream.types[fieldlist_id][1][0] fieldname = fieldname.encode() for field in fieldlist: if fieldname == field[1][3]: break else: raise ValueError(f"Field {fieldname} not found in structure {structname}") field_offset = field[1][2] return field_offset def get_symbol_offset(self, symbol: str) -> int: offset, segment = self.DBIStream.SymRecordStream.search_and_cache_symbols(symbol) if offset == segment == None: return None section_virtual_address = self.DBIStream.SectionHeadersStream[segment - 1].VirtualAddress return section_virtual_address + offset