PYC文件格式

PYC文件格式

利用 010 Editor 的 Template,可以很好地了解 PYC 文件的格式。

1
2
3
4
5
// Top-level layout of a .pyc file: header followed by one marshalled object.
struct {
    // Magic number identifying the Python version (4 bytes in total).
    Magic magic;
    // 4 raw bytes holding the compile time stamp.
    char mtime[4];
    // The marshalled object data that follows the header.
    r_object data;
} file;

4字节魔数
4字节编译时间
对象数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// One marshalled object: a one-byte type tag followed by a payload whose
// layout depends on that tag. Container types nest further r_object values.
typedef struct r_object {
ObjType type;
switch (type) {
// Tag-only objects: nothing follows the type byte.
case TYPE_NULL:
case TYPE_NONE:
case TYPE_STOPITER:
case TYPE_ELLIPSIS:
case TYPE_FALSE:
case TYPE_TRUE:
break;
case TYPE_INT:
r_long value;
break;
case TYPE_INT64:
r_long64 value;
break;
// The absolute value of n is the number of r_short digits that follow.
// NOTE(review): the sign of n presumably carries the sign of the value — confirm against marshal.c.
case TYPE_LONG:
r_long n;
local int size = n<0?-n:n;
r_short digit[size];
break;
// Float stored as text: one length byte, then that many characters.
case TYPE_FLOAT:
r_byte n;
char value[n];
break;
case TYPE_BINARY_FLOAT:
double value;
break;
// Complex stored as two length-prefixed text fields (real, then imaginary).
case TYPE_COMPLEX:
r_byte nr;
char real[nr];
r_byte ni;
char imag[ni];
break;
case TYPE_BINARY_COMPLEX:
double real;
double imag;
break;

// Length-prefixed byte string; the array is skipped when the length is 0.
case TYPE_INTERNED:
case TYPE_STRING:
r_long n;
if (n)
char str[n];
break;
// Only a 4-byte number follows.
// NOTE(review): presumably an index into previously seen interned strings — confirm.
case TYPE_STRINGREF:
r_long n;
break;
// Element count followed by that many nested objects.
// optimize=false: elements differ in size, so each one is parsed individually.
case TYPE_TUPLE:
r_long n;
if (n)
struct r_object elements[n] <optimize=false>;
break;
case TYPE_LIST:
r_long n;
if (n)
struct r_object elements[n] <optimize=false>;
break;
// Key/value pairs with no count; a key whose type is TYPE_NULL ends the dict.
case TYPE_DICT:
while (1) {
struct r_object key;
if (key.type == TYPE_NULL)
break;
struct r_object val;
}
break;
case TYPE_SET:
case TYPE_FROZENSET:
r_long n;
if (n)
struct r_object elements[n] <optimize=false>;
break;
// Code object: four 4-byte counters, the bytecode, seven nested objects,
// the first line number and the line-number table.
case TYPE_CODE:
r_long argcount;
r_long nlocals;
r_long stacksize;
r_long flags;
//struct r_object code;
// Parsed with the dedicated Code struct instead of a generic r_object.
Code code;
struct r_object consts;
struct r_object names;
struct r_object varnames;
struct r_object freevars;
struct r_object cellvars;
struct r_object filename;
struct r_object name;
r_long firstlineno;
//struct r_object lnotab;
// Parsed with the dedicated LnoTab struct instead of a generic r_object.
LnoTab lnotab;
break;
// Any other tag stops the template with a warning.
default:
Warning("unknown type code");
Exit(1);
}
} r_object;

每个对象数据头一个字节指明对象类型
类型及对应的类型码,如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// One-byte type tag that starts every marshalled object.
// Each value is the ASCII character written in the file.
enum <char> ObjType {
TYPE_NULL = '0',
TYPE_NONE = 'N',
TYPE_FALSE = 'F',
TYPE_TRUE = 'T',
TYPE_STOPITER = 'S',
TYPE_ELLIPSIS = '.',
TYPE_INT = 'i',
TYPE_INT64 = 'I',
TYPE_FLOAT = 'f',
TYPE_BINARY_FLOAT = 'g',
TYPE_COMPLEX = 'x',
TYPE_BINARY_COMPLEX = 'y',
TYPE_LONG = 'l',
TYPE_STRING = 's',
TYPE_INTERNED = 't',
TYPE_STRINGREF = 'R',
TYPE_TUPLE = '(',
TYPE_LIST = '[',
TYPE_DICT = '{',
TYPE_CODE = 'c',
TYPE_UNICODE = 'u',
TYPE_UNKNOWN = '?',
TYPE_SET = '<',
TYPE_FROZENSET = '>',
};

每个不同类型的对象有不同结构,根据ObjType来找到对应r_object。

1
2
3
4
5
6
7
case TYPE_NULL:
case TYPE_NONE:
case TYPE_STOPITER:
case TYPE_ELLIPSIS:
case TYPE_FALSE:
case TYPE_TRUE:
break;

这几种类型无后续数据

1
2
3
case TYPE_INT:
r_long value;
break;

整数型:4字节数据

1
2
3
case TYPE_INT64:
r_long64 value;
break;

64位整型:8字节数据

1
2
3
4
5
case TYPE_LONG:
r_long n;
local int size = n<0?-n:n;
r_short digit[size];
break;

长整型(任意精度整数):
4字节有符号数n:符号表示数值的正负,|n|为digit的个数(size)
2*size字节数据(每个digit占2字节)

1
2
3
4
case TYPE_FLOAT:
r_byte n;
char value[n];
break;

浮点型:
1字节 长度
n字节 值

1
2
3
case TYPE_BINARY_FLOAT:
double value;
break;

二进制浮点型:
8字节 数据值

1
2
3
4
5
6
case TYPE_COMPLEX:
r_byte nr;
char real[nr];
r_byte ni;
char imag[ni];
break;

复数:
1字节 实部长度
nr字节 实部值
1字节 虚部长度
ni字节 虚部值

1
2
3
4
case TYPE_BINARY_COMPLEX:
double real;
double imag;
break;

二进制复数:
8字节 实部值
8字节 虚部值

1
2
3
4
5
6
case TYPE_INTERNED:
case TYPE_STRING:
r_long n;
if (n)
char str[n];
break;

字符串:
4字节 长度
n字节 字符

1
2
3
case TYPE_STRINGREF:
r_long n;
break;

字符串引用:
4字节索引(指向之前出现过的interned字符串,不是长度)

1
2
3
4
5
case TYPE_TUPLE:
r_long n;
if (n)
struct r_object elements[n] <optimize=false>;
break;

元组:
4字节元素个数n
n个r_object元素(每个元素长度不定,并非n字节)

1
2
3
4
5
case TYPE_LIST:
r_long n;
if (n)
struct r_object elements[n] <optimize=false>;
break;

列表:
4字节元素个数n
n个r_object元素(每个元素长度不定,并非n字节)

1
2
3
4
5
6
7
8
case TYPE_DICT:
while (1) {
struct r_object key;
if (key.type == TYPE_NULL)
break;
struct r_object val;
}
break;

字典:
r_object key和r_object val为一个字典元素
以类型为TYPE_NULL的r_object截止

1
2
3
4
5
6
7
case TYPE_SET:
case TYPE_FROZENSET:
r_long n;
if (n)
struct r_object elements[n] <optimize=false>;
break;

集合:
4字节元素个数n
n个r_object元素(每个元素长度不定,并非n字节)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
case TYPE_CODE:
r_long argcount;
r_long nlocals;
r_long stacksize;
r_long flags;
//struct r_object code;
Code code;
struct r_object consts;
struct r_object names;
struct r_object varnames;
struct r_object freevars;
struct r_object cellvars;
struct r_object filename;
struct r_object name;
r_long firstlineno;
//struct r_object lnotab;
LnoTab lnotab;
break;

argcount:4字节,code block的位置参数个数
nlocals:4字节全局code block中的局部变量个数
stacksize:4字节code block需要的栈空间
flags:4字节

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// Bytecode of a code object: a TYPE_STRING header (tag + 4-byte length n)
// followed by n bytes that are parsed as a sequence of Instruction structs.
struct Code {
ObjType type;
// The bytecode must be stored as a plain string object; anything else aborts.
if (type != TYPE_STRING) {
Warning("code not in string type");
Exit(1);
}
r_long n;
// remain: bytes of bytecode not yet parsed; end: file offset just past the bytecode.
local int remain = n;
local int end = FTell() + n;

/* trick to optimize parse speed */
// An Instruction occupies at most 6 bytes, so an array of remain/6
// instructions can never read past `end`; repeat until fewer than 6 bytes remain.
while (remain >= 6) {
Instruction inst[remain/6] <read=ReadInstruction,optimize=false>;
remain = end - FTell();
}
remain = end - FTell();

// Parse the last few instructions one at a time.
while (remain > 0) {
Instruction inst <read=ReadInstruction>;
remain -= sizeof(inst);
}
};

code需要以string对象开始
n表示字节码(指令序列)占用的字节数
Instruction结构根据指令码是否具有参数进行区分
有以下三类:
1.无参数指令码-1字节
2.有参数指令码-3字节(1字节指令码+2字节操作数)
3.有拓展参数指令码-4/6字节(嵌套一个ins)

pycTemplate.bt(版本太旧,pyc魔数会随版本变化,可以强行加入自己版本对应的魔数)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
//------------------------------------------------
//--- 010 Editor v3.0.4 Binary Template
//
// File: PYC.bt
// Authors: Kuang-che Wu
// Version: 1.1
// Purpose: Parse python bytecode .pyc and .pyo files,
// support python 2.4 to 2.7.
// Category: Programming
// File Mask: *.pyc,*.pyo
// ID Bytes:
// History:
// 1.1 2016-02-02 SweetScape: Updated header for repository submission.
// 1.0 2009-04-02 K Wu: Initial release.
//------------------------------------------------

// Known values of the 16-bit version word at the start of a .pyc file.
// struct Magic warns "Unknown magic version" for any value not listed here.
enum <uint16> MagicValue {
PY_24a0 = 62041,
PY_24a3 = 62051,
PY_24b1 = 62061,
PY_25a0_1 = 62071,
PY_25a0_2 = 62081,
PY_25a0_3 = 62091,
PY_25a0_4 = 62092,
PY_25b3_1 = 62101,
PY_25b3_2 = 62111,
PY_25c1 = 62121,
PY_25c2 = 62131,
PY_26a0 = 62151,
PY_26a1 = 62161,
PY_27a0_1 = 62171,
PY_27a0_2 = 62181,
// NOTE(review): Opcode2Opname() has no special cases beyond magic 62181 —
// confirm the opcode names are still correct for files carrying this value.
py_27b0_0 = 62211,//magic of the Python version being analysed
};

// marshal obj type of version 2
// version 2 is backward compatible to version 1 (for read)
// One-byte tag that starts every marshalled object; r_object switches on it.
// Each value is the ASCII character written in the file.
enum <char> ObjType {
TYPE_NULL = '0',
TYPE_NONE = 'N',
TYPE_FALSE = 'F',
TYPE_TRUE = 'T',
TYPE_STOPITER = 'S',
TYPE_ELLIPSIS = '.',
TYPE_INT = 'i',
TYPE_INT64 = 'I',
TYPE_FLOAT = 'f',
TYPE_BINARY_FLOAT = 'g',
TYPE_COMPLEX = 'x',
TYPE_BINARY_COMPLEX = 'y',
TYPE_LONG = 'l',
TYPE_STRING = 's',
TYPE_INTERNED = 't',
TYPE_STRINGREF = 'R',
TYPE_TUPLE = '(',
TYPE_LIST = '[',
TYPE_DICT = '{',
TYPE_CODE = 'c',
TYPE_UNICODE = 'u',
TYPE_UNKNOWN = '?',
TYPE_SET = '<',
TYPE_FROZENSET = '>',
};

// Python/import.c
// Magic header of a .pyc file: a 16-bit version word followed by two bytes
// that must equal "\x0d\x0a" (CR LF).
struct Magic {
MagicValue magic1;
char magic2[2];
// Warn and return when the two trailing bytes are not CR LF.
if (magic2 != "\x0d\x0a") {
Warning("bad magic");
return 0;
}
// EnumToString yields "" when magic1 is not a member of MagicValue.
if (EnumToString(magic1) == "") {
Warning("Unknown magic version");
return 0;
}
};

// opcode.h
// this is opname of python 2.4
// please add new opcode in ReadInstruction()
// One-byte opcode values with their Python 2.4 mnemonics. Opcode2Opname()
// starts from these names and patches them according to the file's magic.
// Gaps in the numbering are values with no name in this table.
enum <ubyte> OpCode {
STOP_CODE = 0,
POP_TOP = 1,
ROT_TWO = 2,
ROT_THREE = 3,
DUP_TOP = 4,
ROT_FOUR = 5,

UNARY_POSITIVE = 10,
UNARY_NEGATIVE = 11,
UNARY_NOT = 12,
UNARY_CONVERT = 13,

UNARY_INVERT = 15,

LIST_APPEND = 18,
BINARY_POWER = 19,

BINARY_MULTIPLY = 20,
BINARY_DIVIDE = 21,
BINARY_MODULO = 22,
BINARY_ADD = 23,
BINARY_SUBTRACT = 24,
BINARY_SUBSCR = 25,
BINARY_FLOOR_DIVIDE = 26,
BINARY_TRUE_DIVIDE = 27,
INPLACE_FLOOR_DIVIDE = 28,
INPLACE_TRUE_DIVIDE = 29,

SLICE = 30,
/* Also uses 31-33 */
SLICE_a = 31,
SLICE_b = 32,
SLICE_c = 33,

STORE_SLICE = 40,
/* Also uses 41-43 */
STORE_SLICE_a = 41,
STORE_SLICE_b = 42,
STORE_SLICE_c = 43,

DELETE_SLICE = 50,
/* Also uses 51-53 */
DELETE_SLICE_a = 51,
DELETE_SLICE_b = 52,
DELETE_SLICE_c = 53,

INPLACE_ADD = 55,
INPLACE_SUBTRACT = 56,
INPLACE_MULTIPLY = 57,
INPLACE_DIVIDE = 58,
INPLACE_MODULO = 59,
STORE_SUBSCR = 60,
DELETE_SUBSCR = 61,

BINARY_LSHIFT = 62,
BINARY_RSHIFT = 63,
BINARY_AND = 64,
BINARY_XOR = 65,
BINARY_OR = 66,
INPLACE_POWER = 67,
GET_ITER = 68,

PRINT_EXPR = 70,
PRINT_ITEM = 71,
PRINT_NEWLINE = 72,
PRINT_ITEM_TO = 73,
PRINT_NEWLINE_TO = 74,
INPLACE_LSHIFT = 75,
INPLACE_RSHIFT = 76,
INPLACE_AND = 77,
INPLACE_XOR = 78,
INPLACE_OR = 79,
BREAK_LOOP = 80,
WITH_CLEANUP = 81,
LOAD_LOCALS = 82,
RETURN_VALUE = 83,
IMPORT_STAR = 84,
EXEC_STMT = 85,
YIELD_VALUE = 86,
POP_BLOCK = 87,
END_FINALLY = 88,
BUILD_CLASS = 89,


// Opcodes from here on (>= HAVE_ARGUMENT, i.e. 90) carry a 16-bit operand.
STORE_NAME = 90, /* Index in name list */
DELETE_NAME = 91, /* "" */
UNPACK_SEQUENCE = 92, /* Number of sequence items */
FOR_ITER = 93,

STORE_ATTR = 95, /* Index in name list */
DELETE_ATTR = 96, /* "" */
STORE_GLOBAL = 97, /* "" */
DELETE_GLOBAL = 98, /* "" */
DUP_TOPX = 99, /* number of items to duplicate */
LOAD_CONST = 100, /* Index in const list */
LOAD_NAME = 101, /* Index in name list */
BUILD_TUPLE = 102, /* Number of tuple items */
BUILD_LIST = 103, /* Number of list items */
BUILD_MAP = 104, /* Always zero for now */
LOAD_ATTR = 105, /* Index in name list */
COMPARE_OP = 106, /* Comparison operator */
IMPORT_NAME = 107, /* Index in name list */
IMPORT_FROM = 108, /* Index in name list */

JUMP_FORWARD = 110, /* Number of bytes to skip */
JUMP_IF_FALSE = 111, /* "" */
JUMP_IF_TRUE = 112, /* "" */
JUMP_ABSOLUTE = 113, /* Target byte offset from beginning of code */

LOAD_GLOBAL = 116, /* Index in name list */

CONTINUE_LOOP = 119, /* Start of loop (absolute) */
SETUP_LOOP = 120, /* Target address (relative) */
SETUP_EXCEPT = 121, /* "" */
SETUP_FINALLY = 122, /* "" */

LOAD_FAST = 124, /* Local variable number */
STORE_FAST = 125, /* Local variable number */
DELETE_FAST = 126, /* Local variable number */

RAISE_VARARGS = 130, /* Number of raise arguments (1, 2 or 3) */
/* CALL_FUNCTION_XXX opcodes defined below depend on this definition */
CALL_FUNCTION = 131, /* #args + (#kwargs<<8) */
MAKE_FUNCTION = 132, /* #defaults */
BUILD_SLICE = 133, /* Number of items */

MAKE_CLOSURE = 134, /* #free vars */
LOAD_CLOSURE = 135, /* Load free variable from closure */
LOAD_DEREF = 136, /* Load and dereference from closure cell */
STORE_DEREF = 137, /* Store into cell */

/* The next 3 opcodes must be contiguous and satisfy
(CALL_FUNCTION_VAR - CALL_FUNCTION) & 3 == 1 */
CALL_FUNCTION_VAR = 140, /* #args + (#kwargs<<8) */
CALL_FUNCTION_KW = 141, /* #args + (#kwargs<<8) */
CALL_FUNCTION_VAR_KW = 142, /* #args + (#kwargs<<8) */

/* Support for opargs more than 16 bits long */
EXTENDED_ARG = 143,
};

// ceval.c
// Opcodes numerically >= HAVE_ARGUMENT are followed by a 16-bit operand.
const int HAVE_ARGUMENT = 90;
// Opcode value of the EXTENDED_ARG prefix.
// NOTE(review): shares its name with the OpCode enum member EXTENDED_ARG —
// confirm 010 Editor resolves the duplicate as intended.
const int EXTENDED_ARG = 143;
// One bytecode instruction, 1 to 6 bytes long:
//   opcode                                    (1 byte)
//   opcode + oparg                            (3 bytes)
//   EXTENDED_ARG + oparg_hi + opcode [+ oparg] (4 or 6 bytes)
struct Instruction {
// Peek at the next byte to detect the EXTENDED_ARG prefix before mapping anything.
if (ReadUByte(FTell()) == EXTENDED_ARG) {
ubyte opcode_extended_arg;
// Upper 16 bits of the operand of the instruction that follows.
uint16 oparg_hi;
ubyte opcode;
if (opcode >= HAVE_ARGUMENT)
uint16 oparg;
} else {
ubyte opcode;
if (opcode >= HAVE_ARGUMENT)
uint16 oparg;
}
};

// Fixed-width integer aliases used by the marshal structs below.
// 4-byte signed integer.
typedef int32 r_long;
// 8-byte signed integer.
typedef int64 r_long64;
// 2-byte signed integer.
typedef int16 r_short;
// 1-byte unsigned integer.
typedef ubyte r_byte;

// Bytecode of a code object: a TYPE_STRING header (tag + 4-byte length n)
// followed by n bytes that are parsed as a sequence of Instruction structs.
struct Code {
ObjType type;
// The bytecode must be stored as a plain string object; anything else aborts.
if (type != TYPE_STRING) {
Warning("code not in string type");
Exit(1);
}
r_long n;
// remain: bytes of bytecode not yet parsed; end: file offset just past the bytecode.
local int remain = n;
local int end = FTell() + n;

/* trick to optimize parse speed */
// An Instruction occupies at most 6 bytes, so an array of remain/6
// instructions can never read past `end`; repeat until fewer than 6 bytes remain.
while (remain >= 6) {
Instruction inst[remain/6] <read=ReadInstruction,optimize=false>;
remain = end - FTell();
}
remain = end - FTell();

// Parse the last few instructions one at a time.
while (remain > 0) {
Instruction inst <read=ReadInstruction>;
remain -= sizeof(inst);
}
};

// Map an opcode to its mnemonic for the Python version identified by the
// file's magic number.
//
// Starts from the Python 2.4 table (enum OpCode) and replays, in historical
// order, the opcode additions/removals/renames of later revisions; each group
// is gated on the magic value that introduced it. Within a group a later
// assignment deliberately overrides an earlier one for the same opcode.
//
// opcode: raw opcode value of the instruction.
// Returns the mnemonic, or "" when the opcode has no name in that version.
string Opcode2Opname(OpCode opcode)
{
    uint16 magic = file.magic.magic1;
    local string opname = EnumToString(opcode);
    if (magic >= 0) { // history between python 2.0 and 2.4
        // r27197
        if (opcode == 114) opname = "";
        // r28249
        if (opcode == 81) opname = "RETURN_NONE";
        // r28494
        if (opcode == 81) opname = "";
        // r32346
        if (opcode == 9) opname = "NOP";
        // r32389
        // Clear the name only; `opcode` itself must stay intact because every
        // comparison below still depends on it.
        if (opcode == 9) opname = "";
        // r35378
        if (opcode == 18) opname = "LIST_APPEND";
        // r36216
        if (opcode == 9) opname = "NOP";
    }
    // magic 62041 r36242 marshal version 1
    // magic 62051 r37112
    // magic 62061 r37403
    // magic 62071 r38931 marshal version 2
    // magic 62081 r39773
    if (magic >= 62091) { // r42624
        // r42624
        if (opcode == 81) opname = "WITH_CLEANUP";
    }
    // magic 62092 r42952
    // magic 62101 r50600
    // magic 62111 r50968
    // magic 62121 r51082
    // magic 62131 r51729
    if (magic >= 62151) { // r59548
        // r59548
        if (opcode == 54) opname = "STORE_MAP";
    }
    // magic 62161 r61290
    if (magic >= 62171) { // r67818
        // r67818
        if (opcode == 18) opname = "";
        if (opcode == 94) opname = "LIST_APPEND";
    }
    if (magic >= 62181) { // r70071
        // r70071
        if (opcode == 111) opname = "JUMP_IF_FALSE_OR_POP";
        if (opcode == 112) opname = "JUMP_IF_TRUE_OR_POP";
        if (opcode == 114) opname = "POP_JUMP_IF_FALSE";
        if (opcode == 115) opname = "POP_JUMP_IF_TRUE";
    }

    return opname;
}

// Build the display string for one parsed Instruction (used as the <read>
// function of the Instruction variables declared in struct Code).
//
// ins: the instruction to describe.
// Returns the opcode name alone when the instruction has no operand,
// "COMPARE_OP (<operator>)" for comparisons, and "<name> <operand>" otherwise.
string ReadInstruction(Instruction &ins)
{
    string s;
    OpCode opcode = (OpCode)ins.opcode;
    string opname = Opcode2Opname(opcode);
    if (exists(ins.oparg)) {
        uint32 oparg = ins.oparg;
        // An EXTENDED_ARG prefix supplies the upper 16 bits of the operand.
        if (exists(ins.oparg_hi))
            oparg += (uint32)ins.oparg_hi << 16;
        // Note, COMPARE_OP oparg change name in r24970
        if (opname == "COMPARE_OP") {
            string cmp_op;
            switch (oparg) {
                case 0: cmp_op = "<"; break;
                case 1: cmp_op = "<="; break;
                case 2: cmp_op = "=="; break;
                case 3: cmp_op = "!="; break;
                case 4: cmp_op = ">"; break;
                case 5: cmp_op = ">="; break;
                case 6: cmp_op = "in"; break;
                case 7: cmp_op = "not in"; break;
                case 8: cmp_op = "is"; break;
                case 9: cmp_op = "is not"; break;
                case 10: cmp_op = "exception match"; break;
                case 11: cmp_op = "BAD"; break;
                // Out-of-range operand: show the raw number rather than "()".
                default: SPrintf(cmp_op, "%d", oparg); break;
            }
            SPrintf(s, "%s (%s)", opname, cmp_op);
        } else {
            SPrintf(s, "%s %d", opname, oparg);
        }
    } else {
        s = opname;
    }
    return s;
}

// Line-number table of a code object: a TYPE_STRING header (tag + 4-byte
// length n) followed by n/2 two-byte (bytecode offset delta, line delta) pairs.
struct LnoTab {
    ObjType type;
    // The table must be stored as a plain string object; anything else aborts.
    if (type != TYPE_STRING) {
        Warning("lnotab not in string type");
        Exit(1);
    }
    r_long n;
    // Skip the array when the table holds no complete pair, mirroring the
    // `if (n)` guards r_object uses for its variable-length arrays.
    if (n >= 2)
        struct {
            uchar bytecode_offset_diff;
            uchar line_diff;
        } pair[n/2];
};

// Python/marshal.c
// One marshalled object: a one-byte type tag followed by a payload whose
// layout depends on that tag. Container types nest further r_object values.
typedef struct r_object {
    ObjType type;
    switch (type) {
    // Tag-only objects: nothing follows the type byte.
    case TYPE_NULL:
    case TYPE_NONE:
    case TYPE_STOPITER:
    case TYPE_ELLIPSIS:
    case TYPE_FALSE:
    case TYPE_TRUE:
        break;
    case TYPE_INT:
        r_long value;
        break;
    case TYPE_INT64:
        r_long64 value;
        break;
    // |n| is the number of 2-byte digits that follow; a value of zero has none,
    // so the array is skipped like the other variable-length payloads.
    case TYPE_LONG:
        r_long n;
        local int size = n<0?-n:n;
        if (size)
            r_short digit[size];
        break;
    // Float stored as text: one length byte, then that many characters.
    case TYPE_FLOAT:
        r_byte n;
        char value[n];
        break;
    case TYPE_BINARY_FLOAT:
        double value;
        break;

    // Complex stored as two length-prefixed text fields (real, then imaginary).
    case TYPE_COMPLEX:
        r_byte nr;
        char real[nr];
        r_byte ni;
        char imag[ni];
        break;

    case TYPE_BINARY_COMPLEX:
        double real;
        double imag;
        break;

    // Length-prefixed byte string. TYPE_UNICODE uses the same layout: a 4-byte
    // byte count followed by the UTF-8 encoded text.
    case TYPE_INTERNED:
    case TYPE_STRING:
    case TYPE_UNICODE:
        r_long n;
        if (n)
            char str[n];
        break;
    // Only a 4-byte number follows.
    // NOTE(review): presumably an index into previously seen interned strings — confirm.
    case TYPE_STRINGREF:
        r_long n;
        break;
    // Element count followed by that many nested objects.
    // optimize=false: elements differ in size, so each one is parsed individually.
    case TYPE_TUPLE:
        r_long n;
        if (n)
            struct r_object elements[n] <optimize=false>;
        break;
    case TYPE_LIST:
        r_long n;
        if (n)
            struct r_object elements[n] <optimize=false>;
        break;
    // Key/value pairs with no count; a key whose type is TYPE_NULL ends the dict.
    case TYPE_DICT:
        while (1) {
            struct r_object key;
            if (key.type == TYPE_NULL)
                break;
            struct r_object val;
        }
        break;
    case TYPE_SET:
    case TYPE_FROZENSET:
        r_long n;
        if (n)
            struct r_object elements[n] <optimize=false>;
        break;
    // Code object: four 4-byte counters, the bytecode, seven nested objects,
    // the first line number and the line-number table.
    case TYPE_CODE:
        r_long argcount;
        r_long nlocals;
        r_long stacksize;
        r_long flags;
        // Parsed with the dedicated Code struct so instructions are decoded.
        Code code;
        struct r_object consts;
        struct r_object names;
        struct r_object varnames;
        struct r_object freevars;
        struct r_object cellvars;
        struct r_object filename;
        struct r_object name;
        r_long firstlineno;
        // Parsed with the dedicated LnoTab struct so the pairs are decoded.
        LnoTab lnotab;
        break;
    // Any other tag stops the template with a warning.
    default:
        Warning("unknown type code");
        Exit(1);
    }
} r_object;

// Template entry point: maps the whole .pyc file as header + one marshalled object.
// Opcode2Opname() reads file.magic.magic1 to pick version-specific opcode names.
struct {
// Magic number identifying the Python version.
Magic magic;
// 4 raw bytes following the magic.
// NOTE(review): presumably the source modification/compile time stamp — confirm.
char mtime[4];
// Root marshalled object.
r_object data;
} file;