protobuf_rev/binary_reader.py at main · InkeyP/protobuf_rev · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""
binary_reader.py - 二进制文件抽象层
支持 ELF (Linux) 和 PE (Windows), 32/64-bit
"""

import struct
import re
from pathlib import Path


class BinaryReader:
    """统一的二进制文件读取接口，支持 ELF/PE, 32/64-bit"""

    def __init__(self, filepath: str):
        self.filepath = Path(filepath)
        self.data = self.filepath.read_bytes()
        self.size = len(self.data)

        # 检测文件格式
        self.format = self._detect_format()
        self.bits = self._detect_bits()
        self.endian = self._detect_endian()
        self.ptr_size = self.bits // 8
        self.ptr_fmt = self._ptr_struct_fmt()

        # 解析节区信息
        self._sections = self._parse_sections()

        # 缓存虚拟地址映射
        self._va_mappings = self._build_va_mappings()

    def _detect_format(self) -> str:
        if self.data[:4] == b'\x7fELF':
            return 'elf'
        elif self.data[:2] == b'MZ':
            return 'pe'
        else:
            return 'raw'

    def _detect_bits(self) -> int:
        if self.format == 'elf':
            ei_class = self.data[4]
            return 64 if ei_class == 2 else 32
        elif self.format == 'pe':
            pe_offset = struct.unpack_from('<I', self.data, 0x3C)[0]
            machine = struct.unpack_from('<H', self.data, pe_offset + 4)[0]
            return 64 if machine == 0x8664 else 32
        return 64  # default

    def _detect_endian(self) -> str:
        if self.format == 'elf':
            ei_data = self.data[5]
            return 'big' if ei_data == 2 else 'little'
        return 'little'  # PE is always little-endian

    def _ptr_struct_fmt(self) -> str:
        prefix = '>' if self.endian == 'big' else '<'
        return f'{prefix}Q' if self.bits == 64 else f'{prefix}I'

    def _parse_sections(self) -> list:
        """解析所有节区信息"""
        sections = []
        if self.format == 'elf':
            sections = self._parse_elf_sections()
        elif self.format == 'pe':
            sections = self._parse_pe_sections()
        return sections

    def _parse_elf_sections(self) -> list:
        """解析 ELF 节区"""
        sections = []
        e = '<' if self.endian == 'little' else '>'

        if self.bits == 64:
            e_shoff = struct.unpack_from(f'{e}Q', self.data, 0x28)[0]
            e_shentsize = struct.unpack_from(f'{e}H', self.data, 0x3A)[0]
            e_shnum = struct.unpack_from(f'{e}H', self.data, 0x3C)[0]
            e_shstrndx = struct.unpack_from(f'{e}H', self.data, 0x3E)[0]
        else:
            e_shoff = struct.unpack_from(f'{e}I', self.data, 0x20)[0]
            e_shentsize = struct.unpack_from(f'{e}H', self.data, 0x2E)[0]
            e_shnum = struct.unpack_from(f'{e}H', self.data, 0x30)[0]
            e_shstrndx = struct.unpack_from(f'{e}H', self.data, 0x32)[0]

        if e_shoff == 0 or e_shnum == 0:
            return sections

        # 读取节名字符串表
        if self.bits == 64:
            strtab_off = struct.unpack_from(f'{e}Q', self.data, e_shoff + e_shstrndx * e_shentsize + 0x18)[0]
        else:
            strtab_off = struct.unpack_from(f'{e}I', self.data, e_shoff + e_shstrndx * e_shentsize + 0x10)[0]

        for i in range(e_shnum):
            offset = e_shoff + i * e_shentsize
            if self.bits == 64:
                sh_name_idx = struct.unpack_from(f'{e}I', self.data, offset)[0]
                sh_type = struct.unpack_from(f'{e}I', self.data, offset + 4)[0]
                sh_addr = struct.unpack_from(f'{e}Q', self.data, offset + 0x10)[0]
                sh_offset = struct.unpack_from(f'{e}Q', self.data, offset + 0x18)[0]
                sh_size = struct.unpack_from(f'{e}Q', self.data, offset + 0x20)[0]
            else:
                sh_name_idx = struct.unpack_from(f'{e}I', self.data, offset)[0]
                sh_type = struct.unpack_from(f'{e}I', self.data, offset + 4)[0]
                sh_addr = struct.unpack_from(f'{e}I', self.data, offset + 0x0C)[0]
                sh_offset = struct.unpack_from(f'{e}I', self.data, offset + 0x10)[0]
                sh_size = struct.unpack_from(f'{e}I', self.data, offset + 0x14)[0]

            name = self._read_cstring_at(strtab_off + sh_name_idx)
            sections.append({
                'name': name,
                'type': sh_type,
                'vaddr': sh_addr,
                'offset': sh_offset,
                'size': sh_size,
            })
        return sections

    def _parse_pe_sections(self) -> list:
        """解析 PE 节区"""
        sections = []
        pe_offset = struct.unpack_from('<I', self.data, 0x3C)[0]
        num_sections = struct.unpack_from('<H', self.data, pe_offset + 6)[0]
        optional_header_size = struct.unpack_from('<H', self.data, pe_offset + 0x14)[0]
        section_start = pe_offset + 0x18 + optional_header_size

        # 获取 ImageBase
        if self.bits == 64:
            self._image_base = struct.unpack_from('<Q', self.data, pe_offset + 0x30)[0]
        else:
            self._image_base = struct.unpack_from('<I', self.data, pe_offset + 0x34)[0]

        for i in range(num_sections):
            off = section_start + i * 40
            name_bytes = self.data[off:off + 8]
            name = name_bytes.split(b'\x00')[0].decode('ascii', errors='replace')
            vsize = struct.unpack_from('<I', self.data, off + 8)[0]
            vaddr = struct.unpack_from('<I', self.data, off + 12)[0]
            raw_size = struct.unpack_from('<I', self.data, off + 16)[0]
            raw_offset = struct.unpack_from('<I', self.data, off + 20)[0]

            sections.append({
                'name': name,
                'vaddr': vaddr + getattr(self, '_image_base', 0),
                'offset': raw_offset,
                'size': raw_size,
                'vsize': vsize,
                'raw_vaddr': vaddr,
            })
        return sections

    def _build_va_mappings(self) -> list:
        """构建虚拟地址到文件偏移的映射"""
        mappings = []

        if self.format == 'elf':
            e = '<' if self.endian == 'little' else '>'
            if self.bits == 64:
                e_phoff = struct.unpack_from(f'{e}Q', self.data, 0x20)[0]
                e_phentsize = struct.unpack_from(f'{e}H', self.data, 0x36)[0]
                e_phnum = struct.unpack_from(f'{e}H', self.data, 0x38)[0]
            else:
                e_phoff = struct.unpack_from(f'{e}I', self.data, 0x1C)[0]
                e_phentsize = struct.unpack_from(f'{e}H', self.data, 0x2A)[0]
                e_phnum = struct.unpack_from(f'{e}H', self.data, 0x2C)[0]

            for i in range(e_phnum):
                off = e_phoff + i * e_phentsize
                if self.bits == 64:
                    p_type = struct.unpack_from(f'{e}I', self.data, off)[0]
                    p_offset = struct.unpack_from(f'{e}Q', self.data, off + 0x08)[0]
                    p_vaddr = struct.unpack_from(f'{e}Q', self.data, off + 0x10)[0]
                    p_filesz = struct.unpack_from(f'{e}Q', self.data, off + 0x20)[0]
                else:
                    p_type = struct.unpack_from(f'{e}I', self.data, off)[0]
                    p_offset = struct.unpack_from(f'{e}I', self.data, off + 0x04)[0]
                    p_vaddr = struct.unpack_from(f'{e}I', self.data, off + 0x08)[0]
                    p_filesz = struct.unpack_from(f'{e}I', self.data, off + 0x10)[0]

                if p_type == 1:  # PT_LOAD
                    mappings.append((p_vaddr, p_offset, p_filesz))

        elif self.format == 'pe':
            image_base = getattr(self, '_image_base', 0)
            for sec in self._sections:
                mappings.append((
                    sec['raw_vaddr'] + image_base,
                    sec['offset'],
                    sec['size']
                ))

        return mappings

    def va_to_offset(self, va: int) -> int:
        """虚拟地址转文件偏移"""
        for seg_va, seg_off, seg_size in self._va_mappings:
            if seg_va <= va < seg_va + seg_size:
                return va - seg_va + seg_off
        # fallback: 直接返回 (对于 raw 格式)
        return va

    def offset_to_va(self, offset: int) -> int:
        """文件偏移转虚拟地址"""
        for seg_va, seg_off, seg_size in self._va_mappings:
            if seg_off <= offset < seg_off + seg_size:
                return offset - seg_off + seg_va
        return offset

    def read_bytes(self, offset: int, size: int) -> bytes:
        """从文件偏移处读取字节"""
        if offset < 0 or offset + size > self.size:
            return b'\x00' * size
        return self.data[offset:offset + size]

    def read_bytes_va(self, va: int, size: int) -> bytes:
        """从虚拟地址读取字节"""
        return self.read_bytes(self.va_to_offset(va), size)

    def read_ptr(self, offset: int) -> int:
        """从文件偏移处读取一个指针"""
        data = self.read_bytes(offset, self.ptr_size)
        return struct.unpack(self.ptr_fmt, data)[0]

    def read_ptr_va(self, va: int) -> int:
        """从虚拟地址读取一个指针"""
        return self.read_ptr(self.va_to_offset(va))

    def read_u32(self, offset: int) -> int:
        e = '>' if self.endian == 'big' else '<'
        return struct.unpack_from(f'{e}I', self.data, offset)[0]

    def read_u32_va(self, va: int) -> int:
        return self.read_u32(self.va_to_offset(va))

    def read_u64(self, offset: int) -> int:
        e = '>' if self.endian == 'big' else '<'
        return struct.unpack_from(f'{e}Q', self.data, offset)[0]

    def _read_cstring_at(self, offset: int, max_len: int = 4096) -> str:
        """从文件偏移处读取 C 字符串"""
        if offset < 0 or offset >= self.size:
            return ''
        end = self.data.find(b'\x00', offset, min(offset + max_len, self.size))
        if end == -1:
            end = min(offset + max_len, self.size)
        try:
            return self.data[offset:end].decode('utf-8')
        except UnicodeDecodeError:
            return self.data[offset:end].decode('latin-1')

    def read_cstring(self, offset: int) -> str:
        """从文件偏移处读取 C 字符串"""
        return self._read_cstring_at(offset)

    def read_cstring_va(self, va: int) -> str:
        """从虚拟地址读取 C 字符串"""
        return self._read_cstring_at(self.va_to_offset(va))

    def search(self, pattern: bytes, start: int = 0) -> list:
        """在整个文件中搜索字节模式，返回所有文件偏移"""
        results = []
        pos = start
        while True:
            pos = self.data.find(pattern, pos)
            if pos == -1:
                break
            results.append(pos)
            pos += 1
        return results

    def search_regex(self, pattern: bytes, start: int = 0) -> list:
        """正则搜索，返回所有 (offset, match) 对"""
        results = []
        for m in re.finditer(pattern, self.data[start:]):
            results.append((start + m.start(), m.group()))
        return results

    def get_section(self, name: str) -> dict | None:
        """按名称获取节区"""
        for sec in self._sections:
            if sec['name'] == name:
                return sec
        return None

    def get_section_data(self, name: str) -> bytes:
        """获取节区数据"""
        sec = self.get_section(name)
        if sec:
            return self.data[sec['offset']:sec['offset'] + sec['size']]
        return b''

    @property
    def info(self) -> str:
        fmt = self.format.upper()
        return f"{fmt} {self.bits}-bit {self.endian}-endian, {self.size} bytes, {len(self._sections)} sections"