Micro-optimisations can speed up CPython
Last time, I bemoaned what compilers did to some of the CPython interpreter main loop. Following those remarks, there are three obvious courses of action:
- Make targeted improvements to the compilers.
- Write the interpreter main loop directly in assembly.
- Tweak the C source code to make it more amenable to good compilation.
Option three is the easiest to explore, so let's start with a random benchmark to use as a baseline:
Python 3.6.0+ (default, Mar 7 2017, 00:04:40)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from performance.benchmarks.bm_nbody import bench_nbody
>>> bench_nbody(10, 'sun', 100000)
9.907924063038081
The following patch is a little long, but each individual change is relatively boring, and all the changes are motivated by what we saw in gcc's assembly:
- Using
f->f_localsplus
instead offastlocals
, to save on a memory load. - Using
oparg
as an unsigned type, to save onmovsxd
instructions. - Computing
next_instr - first_instr
as a difference ofchar*
s rather thanuint16_t*
s, to save on asar
and anadd
.
diff --git a/Python/ceval.c b/Python/ceval.c
index d5172b9..79ccf2a 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -729,7 +729,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
int opcode; /* Current opcode */
int oparg; /* Current opcode argument, if any */
enum why_code why; /* Reason for block stack unwind */
- PyObject **fastlocals, **freevars;
+ PyObject **freevars;
PyObject *retval = NULL; /* Return value */
PyThreadState *tstate = PyThreadState_GET();
PyCodeObject *co;
@@ -865,7 +865,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
/* Code access macros */
/* The integer overflow is checked by an assertion below. */
-#define INSTR_OFFSET() (sizeof(_Py_CODEUNIT) * (int)(next_instr - first_instr))
+#define INSTR_OFFSET() ((char*)next_instr - (char*)first_instr)
#define NEXTOPARG() do { \
_Py_CODEUNIT word = *next_instr; \
opcode = _Py_OPCODE(word); \
@@ -959,7 +959,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
/* Local variable macros */
-#define GETLOCAL(i) (fastlocals[i])
+#define GETLOCAL(i) (f->f_localsplus[i])
/* The SETLOCAL() macro must not DECREF the local variable in-place and
then store the new value; it must copy the old value to a temporary
@@ -1045,7 +1045,6 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
co = f->f_code;
names = co->co_names;
consts = co->co_consts;
- fastlocals = f->f_localsplus;
freevars = f->f_localsplus + co->co_nlocals;
assert(PyBytes_Check(co->co_code));
assert(PyBytes_GET_SIZE(co->co_code) <= INT_MAX);
@@ -1228,7 +1227,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
FAST_DISPATCH();
TARGET(LOAD_FAST) {
- PyObject *value = GETLOCAL(oparg);
+ PyObject *value = GETLOCAL((unsigned)oparg);
if (value == NULL) {
format_exc_check_arg(PyExc_UnboundLocalError,
UNBOUNDLOCAL_ERROR_MSG,
@@ -1242,7 +1241,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
PREDICTED(LOAD_CONST);
TARGET(LOAD_CONST) {
- PyObject *value = GETITEM(consts, oparg);
+ PyObject *value = GETITEM(consts, (unsigned)oparg);
Py_INCREF(value);
PUSH(value);
FAST_DISPATCH();
@@ -1251,7 +1250,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
PREDICTED(STORE_FAST);
TARGET(STORE_FAST) {
PyObject *value = POP();
- SETLOCAL(oparg, value);
+ SETLOCAL((unsigned)oparg, value);
FAST_DISPATCH();
}
@@ -1526,7 +1525,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
TARGET(LIST_APPEND) {
PyObject *v = POP();
- PyObject *list = PEEK(oparg);
+ PyObject *list = PEEK((size_t)(unsigned)oparg);
int err;
err = PyList_Append(list, v);
Py_DECREF(v);
@@ -1731,7 +1730,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
_Py_IDENTIFIER(__annotations__);
PyObject *ann_dict;
PyObject *ann = POP();
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
int err;
if (f->f_locals == NULL) {
PyErr_Format(PyExc_SystemError,
@@ -2155,7 +2154,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(STORE_NAME) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *v = POP();
PyObject *ns = f->f_locals;
int err;
@@ -2176,7 +2175,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(DELETE_NAME) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *ns = f->f_locals;
int err;
if (ns == NULL) {
@@ -2198,7 +2197,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
TARGET(UNPACK_SEQUENCE) {
PyObject *seq = POP(), *item, **items;
if (PyTuple_CheckExact(seq) &&
- PyTuple_GET_SIZE(seq) == oparg) {
+ PyTuple_GET_SIZE(seq) == (Py_ssize_t)(size_t)(unsigned)oparg) {
items = ((PyTupleObject *)seq)->ob_item;
while (oparg--) {
item = items[oparg];
@@ -2206,7 +2205,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
PUSH(item);
}
} else if (PyList_CheckExact(seq) &&
- PyList_GET_SIZE(seq) == oparg) {
+ PyList_GET_SIZE(seq) == (Py_ssize_t)(size_t)(unsigned)oparg) {
items = ((PyListObject *)seq)->ob_item;
while (oparg--) {
item = items[oparg];
@@ -2215,7 +2214,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
} else if (unpack_iterable(seq, oparg, -1,
stack_pointer + oparg)) {
- STACKADJ(oparg);
+ STACKADJ((unsigned)oparg);
} else {
/* unpack_iterable() raised an exception */
Py_DECREF(seq);
@@ -2241,7 +2240,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(STORE_ATTR) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *owner = TOP();
PyObject *v = SECOND();
int err;
@@ -2255,7 +2254,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(DELETE_ATTR) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *owner = POP();
int err;
err = PyObject_SetAttr(owner, name, (PyObject *)NULL);
@@ -2266,7 +2265,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(STORE_GLOBAL) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *v = POP();
int err;
err = PyDict_SetItem(f->f_globals, name, v);
@@ -2277,7 +2276,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(DELETE_GLOBAL) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
int err;
err = PyDict_DelItem(f->f_globals, name);
if (err != 0) {
@@ -2289,7 +2288,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(LOAD_NAME) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *locals = f->f_locals;
PyObject *v;
if (locals == NULL) {
@@ -2340,7 +2339,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(LOAD_GLOBAL) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *v;
if (PyDict_CheckExact(f->f_globals)
&& PyDict_CheckExact(f->f_builtins))
@@ -2385,9 +2384,9 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(DELETE_FAST) {
- PyObject *v = GETLOCAL(oparg);
+ PyObject *v = GETLOCAL((unsigned)oparg);
if (v != NULL) {
- SETLOCAL(oparg, NULL);
+ SETLOCAL((unsigned)oparg, NULL);
DISPATCH();
}
format_exc_check_arg(
@@ -2488,7 +2487,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(BUILD_TUPLE) {
- PyObject *tup = PyTuple_New(oparg);
+ PyObject *tup = PyTuple_New((unsigned)oparg);
if (tup == NULL)
goto error;
while (--oparg >= 0) {
@@ -2500,7 +2499,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(BUILD_LIST) {
- PyObject *list = PyList_New(oparg);
+ PyObject *list = PyList_New((unsigned)oparg);
if (list == NULL)
goto error;
while (--oparg >= 0) {
@@ -2571,7 +2570,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
err = PySet_Add(set, item);
Py_DECREF(item);
}
- STACKADJ(-oparg);
+ STACKADJ(-(size_t)(unsigned)oparg);
if (err != 0) {
Py_DECREF(set);
goto error;
@@ -2601,7 +2600,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
TARGET(BUILD_MAP) {
Py_ssize_t i;
- PyObject *map = _PyDict_NewPresized((Py_ssize_t)oparg);
+ PyObject *map = _PyDict_NewPresized((size_t)(unsigned)oparg);
if (map == NULL)
goto error;
for (i = oparg; i > 0; i--) {
@@ -2684,12 +2683,12 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
PyObject *map;
PyObject *keys = TOP();
if (!PyTuple_CheckExact(keys) ||
- PyTuple_GET_SIZE(keys) != (Py_ssize_t)oparg) {
+ PyTuple_GET_SIZE(keys) != (Py_ssize_t)(size_t)(unsigned)oparg) {
PyErr_SetString(PyExc_SystemError,
"bad BUILD_CONST_KEY_MAP keys argument");
goto error;
}
- map = _PyDict_NewPresized((Py_ssize_t)oparg);
+ map = _PyDict_NewPresized((size_t)(unsigned)oparg);
if (map == NULL) {
goto error;
}
@@ -2746,7 +2745,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
for (i = oparg; i > 0; i--) {
PyObject *arg = PEEK(i);
if (_PyDict_MergeEx(sum, arg, 2) < 0) {
- PyObject *func = PEEK(2 + oparg);
+ PyObject *func = PEEK(2 + (unsigned)oparg);
if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
PyErr_Format(PyExc_TypeError,
"%.200s%.200s argument after ** "
@@ -2810,7 +2809,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(LOAD_ATTR) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *owner = TOP();
PyObject *res = PyObject_GetAttr(owner, name);
Py_DECREF(owner);
@@ -2835,7 +2834,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(IMPORT_NAME) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *fromlist = POP();
PyObject *level = TOP();
PyObject *res;
@@ -2869,7 +2868,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(IMPORT_FROM) {
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *from = TOP();
PyObject *res;
res = import_from(from, name);
@@ -2880,7 +2879,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(JUMP_FORWARD) {
- JUMPBY(oparg);
+ JUMPBY((unsigned)oparg);
FAST_DISPATCH();
}
@@ -2894,7 +2893,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
if (cond == Py_False) {
Py_DECREF(cond);
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
FAST_DISPATCH();
}
err = PyObject_IsTrue(cond);
@@ -2902,7 +2901,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
if (err > 0)
err = 0;
else if (err == 0)
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
else
goto error;
DISPATCH();
@@ -2918,14 +2917,14 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
if (cond == Py_True) {
Py_DECREF(cond);
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
FAST_DISPATCH();
}
err = PyObject_IsTrue(cond);
Py_DECREF(cond);
if (err > 0) {
err = 0;
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
}
else if (err == 0)
;
@@ -2943,7 +2942,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
FAST_DISPATCH();
}
if (cond == Py_False) {
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
FAST_DISPATCH();
}
err = PyObject_IsTrue(cond);
@@ -2953,7 +2952,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
err = 0;
}
else if (err == 0)
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
else
goto error;
DISPATCH();
@@ -2968,13 +2967,13 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
FAST_DISPATCH();
}
if (cond == Py_True) {
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
FAST_DISPATCH();
}
err = PyObject_IsTrue(cond);
if (err > 0) {
err = 0;
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
}
else if (err == 0) {
STACKADJ(-1);
@@ -2987,7 +2986,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
PREDICTED(JUMP_ABSOLUTE);
TARGET(JUMP_ABSOLUTE) {
- JUMPTO(oparg);
+ JUMPTO((unsigned)oparg);
#if FAST_LOOPS
/* Enabling this path speeds-up all while and for-loops by bypassing
the per-loop checks for signals. By default, this should be turned-off
@@ -3065,7 +3064,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
/* iterator ended normally */
STACKADJ(-1);
Py_DECREF(iter);
- JUMPBY(oparg);
+ JUMPBY((unsigned)oparg);
PREDICT(POP_BLOCK);
DISPATCH();
}
@@ -3076,7 +3075,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
}
TARGET(CONTINUE_LOOP) {
- retval = PyLong_FromLong(oparg);
+ retval = PyLong_FromLong((unsigned)oparg);
if (retval == NULL)
goto error;
why = WHY_CONTINUE;
@@ -3755,7 +3754,7 @@ format_missing(const char *kind, PyCodeObject *co, PyObject *names)
static void
missing_arguments(PyCodeObject *co, Py_ssize_t missing, Py_ssize_t defcount,
- PyObject **fastlocals)
+ PyFrameObject *f)
{
Py_ssize_t i, j = 0;
Py_ssize_t start, end;
@@ -3793,7 +3792,7 @@ missing_arguments(PyCodeObject *co, Py_ssize_t missing, Py_ssize_t defcount,
static void
too_many_positional(PyCodeObject *co, Py_ssize_t given, Py_ssize_t defcount,
- PyObject **fastlocals)
+ PyFrameObject *f)
{
int plural;
Py_ssize_t kwonly_given = 0;
@@ -3863,7 +3862,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
PyCodeObject* co = (PyCodeObject*)_co;
PyFrameObject *f;
PyObject *retval = NULL;
- PyObject **fastlocals, **freevars;
+ PyObject **freevars;
PyThreadState *tstate;
PyObject *x, *u;
const Py_ssize_t total_args = co->co_argcount + co->co_kwonlyargcount;
@@ -3883,7 +3882,6 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
if (f == NULL) {
return NULL;
}
- fastlocals = f->f_localsplus;
freevars = f->f_localsplus + co->co_nlocals;
/* Create a dictionary for keyword parameters (**kwags) */
@@ -3990,7 +3988,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
/* Check the number of positional arguments */
if (argcount > co->co_argcount && !(co->co_flags & CO_VARARGS)) {
- too_many_positional(co, argcount, defcount, fastlocals);
+ too_many_positional(co, argcount, defcount, f);
goto fail;
}
@@ -4004,7 +4002,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
}
}
if (missing) {
- missing_arguments(co, missing, defcount, fastlocals);
+ missing_arguments(co, missing, defcount, f);
goto fail;
}
if (n > m)
@@ -4039,7 +4037,7 @@ _PyEval_EvalCodeWithName(PyObject *_co, PyObject *globals, PyObject *locals,
missing++;
}
if (missing) {
- missing_arguments(co, missing, -1, fastlocals);
+ missing_arguments(co, missing, -1, f);
goto fail;
}
}
@@ -4845,7 +4843,6 @@ _PyFunction_FastCall(PyCodeObject *co, PyObject **args, Py_ssize_t nargs,
{
PyFrameObject *f;
PyThreadState *tstate = PyThreadState_GET();
- PyObject **fastlocals;
Py_ssize_t i;
PyObject *result;
@@ -4861,11 +4858,9 @@ _PyFunction_FastCall(PyCodeObject *co, PyObject **args, Py_ssize_t nargs,
return NULL;
}
- fastlocals = f->f_localsplus;
-
for (i = 0; i < nargs; i++) {
Py_INCREF(*args);
- fastlocals[i] = *args++;
+ f->f_localsplus[(size_t)i] = *args++;
}
result = PyEval_EvalFrameEx(f,0);
@@ -5335,9 +5330,8 @@ unicode_concatenate(PyObject *v, PyObject *w,
switch (opcode) {
case STORE_FAST:
{
- PyObject **fastlocals = f->f_localsplus;
- if (GETLOCAL(oparg) == v)
- SETLOCAL(oparg, NULL);
+ if (GETLOCAL((unsigned)oparg) == v)
+ SETLOCAL((unsigned)oparg, NULL);
break;
}
case STORE_DEREF:
@@ -5352,7 +5346,7 @@ unicode_concatenate(PyObject *v, PyObject *w,
case STORE_NAME:
{
PyObject *names = f->f_code->co_names;
- PyObject *name = GETITEM(names, oparg);
+ PyObject *name = GETITEM(names, (unsigned)oparg);
PyObject *locals = f->f_locals;
if (PyDict_CheckExact(locals) &&
PyDict_GetItem(locals, name) == v) {
With all of these changes applied, we get a 1.3% speedup:
Python 3.6.0+ (default, Mar 7 2017, 00:06:13)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from performance.benchmarks.bm_nbody import bench_nbody
>>> bench_nbody(10, 'sun', 100000)
9.777307317999657
A 1.3% speedup is simutaneously not very much, and also a surprisingly large amount for eliminating just a few instructions here and there. Of course, your mileage may vary, and this is just one randomly chosen benchmark.