From 930653418d26211ecbbdc2d7e46d1ed802730099 Mon Sep 17 00:00:00 2001
From: Roman Khimov <roman@nspcc.ru>
Date: Sat, 21 Aug 2021 19:09:44 +0300
Subject: [PATCH] vm: rework stack as a simple slice
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A doubly linked list is quite expensive to manage, especially given that it
requires microallocations for each Element. It can be replaced by a simple slice,
which is much more efficient for the simple push/pop operations that are very
typical in a VM. I was a little worried about more complex operations like
XDROP/1024 or REVERSEN/1024 because these require copying quite a substantial
number of elements, but it turns out these work fine too.

At the moment Element is kept as a convenient wrapper for the Bytes/BigInt/Bool/etc.
methods, but this can be changed in the future. Many other potential optimizations
are also possible now.

Complex scripts:
name                    old time/op    new time/op    delta
ScriptFibonacci-8         1.11ms ± 2%    0.85ms ± 2%  -23.40%  (p=0.000 n=10+10)
ScriptNestedRefCount-8    1.46ms ± 2%    1.16ms ± 1%  -20.65%  (p=0.000 n=10+10)
ScriptPushPop/4-8         1.81µs ± 1%    1.54µs ± 4%  -14.96%  (p=0.000 n=8+10)
ScriptPushPop/16-8        4.88µs ± 2%    3.91µs ± 2%  -19.87%  (p=0.000 n=9+9)
ScriptPushPop/128-8       31.9µs ± 9%    26.7µs ± 3%  -16.28%  (p=0.000 n=9+8)
ScriptPushPop/1024-8       235µs ± 1%     192µs ± 3%  -18.31%  (p=0.000 n=9+10)

name                    old alloc/op   new alloc/op   delta
ScriptFibonacci-8          392kB ± 0%     123kB ± 0%  -68.68%  (p=0.000 n=8+8)
ScriptNestedRefCount-8     535kB ± 0%     266kB ± 0%  -50.38%  (p=0.000 n=6+10)
ScriptPushPop/4-8           352B ± 0%      160B ± 0%  -54.55%  (p=0.000 n=10+10)
ScriptPushPop/16-8        1.41kB ± 0%    0.64kB ± 0%  -54.55%  (p=0.000 n=10+10)
ScriptPushPop/128-8       11.3kB ± 0%     8.7kB ± 0%  -22.73%  (p=0.000 n=10+10)
ScriptPushPop/1024-8      90.1kB ± 0%    73.2kB ± 0%  -18.75%  (p=0.000 n=10+10)

name                    old allocs/op  new allocs/op  delta
ScriptFibonacci-8          9.14k ± 0%     3.53k ± 0%  -61.41%  (p=0.000 n=10+10)
ScriptNestedRefCount-8     17.4k ± 0%     11.8k ± 0%  -32.35%  (p=0.000 n=10+10)
ScriptPushPop/4-8           12.0 ± 0%       8.0 ± 0%  -33.33%  (p=0.000 n=10+10)
ScriptPushPop/16-8          48.0 ± 0%      32.0 ± 0%  -33.33%  (p=0.000 n=10+10)
ScriptPushPop/128-8          384 ± 0%       259 ± 0%  -32.55%  (p=0.000 n=10+10)
ScriptPushPop/1024-8       3.07k ± 0%     2.05k ± 0%  -33.14%  (p=0.000 n=10+10)

Some stack-management opcodes:

name                                 old time/op    new time/op    delta
Opcodes/XDROP/0/1-8                     255ns ± 9%     273ns ±11%    +6.92%  (p=0.016 n=11+10)
Opcodes/XDROP/0/1024-8                  362ns ± 2%     365ns ± 8%      ~     (p=0.849 n=10+11)
Opcodes/XDROP/1024/1024-8              3.20µs ± 2%    1.99µs ±12%   -37.69%  (p=0.000 n=11+11)
Opcodes/XDROP/2047/2048-8              6.55µs ± 3%    1.75µs ± 5%   -73.26%  (p=0.000 n=10+11)
Opcodes/DUP/null-8                      414ns ± 6%     245ns ±12%   -40.88%  (p=0.000 n=11+11)
Opcodes/DUP/boolean-8                   411ns ± 8%     245ns ± 6%   -40.31%  (p=0.000 n=11+11)
Opcodes/DUP/integer/small-8             684ns ± 8%     574ns ± 3%   -16.02%  (p=0.000 n=11+10)
Opcodes/DUP/integer/big-8               675ns ± 6%     601ns ±10%   -10.98%  (p=0.000 n=11+11)
Opcodes/DUP/bytearray/small-8           675ns ±10%     566ns ±10%   -16.22%  (p=0.000 n=11+11)
Opcodes/DUP/bytearray/big-8            6.39µs ±11%    6.13µs ± 3%      ~     (p=0.148 n=10+10)
Opcodes/DUP/buffer/small-8              412ns ± 5%     261ns ± 8%   -36.55%  (p=0.000 n=9+11)
Opcodes/DUP/buffer/big-8                586ns ±10%     337ns ± 7%   -42.53%  (p=0.000 n=11+11)
Opcodes/DUP/struct/small-8              458ns ±12%     256ns ±12%   -44.09%  (p=0.000 n=11+11)
Opcodes/DUP/struct/big-8                489ns ± 7%     274ns ± 5%   -44.06%  (p=0.000 n=10+10)
Opcodes/DUP/pointer-8                   586ns ± 7%     494ns ± 7%   -15.67%  (p=0.000 n=11+11)
Opcodes/OVER/null-8                     450ns ±14%     264ns ±10%   -41.30%  (p=0.000 n=11+11)
Opcodes/OVER/boolean-8                  450ns ±14%     264ns ±10%   -41.31%  (p=0.000 n=11+11)
Opcodes/OVER/integer/small-8            716ns ± 9%     604ns ± 6%   -15.65%  (p=0.000 n=11+11)
Opcodes/OVER/integer/big-8              696ns ± 5%     634ns ± 6%    -8.89%  (p=0.000 n=10+11)
Opcodes/OVER/bytearray/small-8          693ns ± 1%     539ns ± 9%   -22.18%  (p=0.000 n=9+10)
Opcodes/OVER/bytearray/big-8           6.33µs ± 2%    6.16µs ± 4%    -2.79%  (p=0.004 n=8+10)
Opcodes/OVER/buffer/small-8             415ns ± 4%     263ns ± 8%   -36.76%  (p=0.000 n=9+11)
Opcodes/OVER/buffer/big-8               587ns ± 5%     342ns ± 7%   -41.70%  (p=0.000 n=11+11)
Opcodes/OVER/struct/small-8             446ns ±14%     257ns ± 8%   -42.42%  (p=0.000 n=11+11)
Opcodes/OVER/struct/big-8               607ns ±26%     278ns ± 7%   -54.25%  (p=0.000 n=11+11)
Opcodes/OVER/pointer-8                  645ns ±12%     476ns ±10%   -26.21%  (p=0.000 n=11+11)
Opcodes/PICK/2/null-8                   460ns ±11%     264ns ± 9%   -42.68%  (p=0.000 n=11+11)
Opcodes/PICK/2/boolean-8                460ns ± 4%     260ns ± 4%   -43.37%  (p=0.000 n=8+11)
Opcodes/PICK/2/integer/small-8          725ns ± 7%     557ns ± 4%   -23.19%  (p=0.000 n=11+10)
Opcodes/PICK/2/integer/big-8            722ns ±12%     582ns ± 6%   -19.51%  (p=0.000 n=11+11)
Opcodes/PICK/2/bytearray/small-8        705ns ± 6%     545ns ± 4%   -22.69%  (p=0.000 n=11+11)
Opcodes/PICK/2/bytearray/big-8         7.17µs ±36%    6.37µs ± 8%      ~     (p=0.065 n=11+11)
Opcodes/PICK/2/buffer/small-8           427ns ± 8%     253ns ± 8%   -40.82%  (p=0.000 n=11+11)
Opcodes/PICK/2/buffer/big-8             590ns ± 3%     331ns ± 6%   -43.83%  (p=0.000 n=11+11)
Opcodes/PICK/2/struct/small-8           428ns ± 8%     254ns ± 7%   -40.64%  (p=0.000 n=11+11)
Opcodes/PICK/2/struct/big-8             489ns ±15%     283ns ± 7%   -42.11%  (p=0.000 n=11+11)
Opcodes/PICK/2/pointer-8                553ns ± 7%     414ns ± 8%   -25.18%  (p=0.000 n=11+11)
Opcodes/PICK/1024/null-8                531ns ± 4%     327ns ± 6%   -38.49%  (p=0.000 n=10+10)
Opcodes/PICK/1024/boolean-8             527ns ± 5%     318ns ± 5%   -39.78%  (p=0.000 n=11+9)
Opcodes/PICK/1024/integer/small-8       861ns ± 4%     683ns ± 4%   -20.66%  (p=0.000 n=11+11)
Opcodes/PICK/1024/integer/big-8         882ns ± 4%    1060ns ±47%      ~     (p=0.748 n=11+11)
Opcodes/PICK/1024/bytearray/small-8     850ns ± 4%     671ns ± 5%   -21.12%  (p=0.000 n=10+11)
Opcodes/PICK/1024/bytearray/big-8      6.32µs ±26%    6.75µs ± 4%    +6.86%  (p=0.019 n=10+11)
Opcodes/PICK/1024/buffer/small-8        530ns ± 6%     324ns ± 5%   -38.86%  (p=0.000 n=10+11)
Opcodes/PICK/1024/buffer/big-8          570ns ± 4%     417ns ±45%   -26.82%  (p=0.001 n=11+10)
Opcodes/PICK/1024/struct/small-8      1.11µs ±122%    0.34µs ±11%   -69.38%  (p=0.000 n=11+10)
Opcodes/PICK/1024/pointer-8             693ns ± 5%     568ns ±31%   -18.10%  (p=0.002 n=10+10)
Opcodes/TUCK/null-8                     450ns ±10%     275ns ± 8%   -38.93%  (p=0.000 n=11+11)
Opcodes/TUCK/boolean-8                  449ns ±13%     268ns ± 9%   -40.16%  (p=0.000 n=11+10)
Opcodes/TUCK/integer/small-8            716ns ± 7%     599ns ± 7%   -16.30%  (p=0.000 n=11+11)
Opcodes/TUCK/integer/big-8              718ns ± 8%     613ns ±11%   -14.55%  (p=0.000 n=11+11)
Opcodes/TUCK/bytearray/small-8          700ns ±12%     558ns ± 7%   -20.39%  (p=0.000 n=11+11)
Opcodes/TUCK/bytearray/big-8           5.88µs ± 7%    6.37µs ± 3%    +8.31%  (p=0.000 n=10+11)
Opcodes/TUCK/buffer/small-8             425ns ± 6%     258ns ±12%   -39.28%  (p=0.000 n=11+11)
Opcodes/TUCK/buffer/big-8               553ns ±19%     334ns ± 6%   -39.57%  (p=0.000 n=11+11)
Opcodes/TUCK/struct/small-8             474ns ± 3%     263ns ±12%   -44.51%  (p=0.000 n=10+11)
Opcodes/TUCK/struct/big-8               641ns ±24%     284ns ± 8%   -55.63%  (p=0.000 n=11+11)
Opcodes/TUCK/pointer-8                  635ns ±13%     468ns ±16%   -26.31%  (p=0.000 n=11+11)
Opcodes/SWAP/null-8                     227ns ±31%     212ns ±11%      ~     (p=0.847 n=11+11)
Opcodes/SWAP/integer-8                  233ns ±32%     210ns ±14%      ~     (p=0.072 n=10+11)
Opcodes/SWAP/big_bytes-8                263ns ±39%     211ns ±11%      ~     (p=0.056 n=11+11)
Opcodes/ROT/null-8                      308ns ±68%     223ns ±12%      ~     (p=0.519 n=11+11)
Opcodes/ROT/integer-8                   226ns ±25%     228ns ± 9%      ~     (p=0.705 n=10+11)
Opcodes/ROT/big_bytes-8                 215ns ±18%     218ns ± 7%      ~     (p=0.756 n=10+11)
Opcodes/ROLL/4/null-8                   269ns ±10%     295ns ± 9%    +9.42%  (p=0.002 n=10+11)
Opcodes/ROLL/4/integer-8                344ns ±48%     280ns ± 2%      ~     (p=0.882 n=11+9)
Opcodes/ROLL/4/big_bytes-8              276ns ±13%     288ns ± 4%    +4.38%  (p=0.046 n=9+11)
Opcodes/ROLL/1024/null-8               4.21µs ±70%    1.01µs ± 9%   -76.15%  (p=0.000 n=11+11)
Opcodes/ROLL/1024/integer-8            4.78µs ±82%    0.71µs ± 3%   -85.06%  (p=0.000 n=11+11)
Opcodes/ROLL/1024/big_bytes-8          3.28µs ± 5%    1.35µs ±36%   -58.91%  (p=0.000 n=9+11)
Opcodes/REVERSE3/null-8                 219ns ± 9%     224ns ± 9%      ~     (p=0.401 n=11+11)
Opcodes/REVERSE3/integer-8              261ns ±28%     220ns ± 6%   -15.67%  (p=0.015 n=11+11)
Opcodes/REVERSE3/big_bytes-8            245ns ±31%     218ns ± 7%      ~     (p=0.051 n=10+11)
Opcodes/REVERSE4/null-8                 223ns ±10%     218ns ± 6%      ~     (p=0.300 n=11+11)
Opcodes/REVERSE4/integer-8              233ns ±10%     220ns ± 7%    -5.74%  (p=0.016 n=11+11)
Opcodes/REVERSE4/big_bytes-8            225ns ±10%     220ns ± 7%      ~     (p=0.157 n=10+11)
Opcodes/REVERSEN/5/null-8               281ns ±12%     277ns ± 4%      ~     (p=0.847 n=11+11)
Opcodes/REVERSEN/5/integer-8            280ns ±11%     275ns ± 5%      ~     (p=0.243 n=11+11)
Opcodes/REVERSEN/5/big_bytes-8          283ns ± 9%     276ns ± 7%      ~     (p=0.133 n=11+11)
Opcodes/REVERSEN/1024/null-8           4.85µs ± 6%    1.94µs ± 6%   -60.07%  (p=0.000 n=10+11)
Opcodes/REVERSEN/1024/integer-8        4.97µs ± 7%    1.99µs ±22%   -59.88%  (p=0.000 n=11+11)
Opcodes/REVERSEN/1024/big_bytes-8      5.11µs ±10%    2.00µs ± 4%   -60.87%  (p=0.000 n=10+9)
Opcodes/PACK/1-8                       1.22µs ± 7%    0.95µs ± 6%   -22.17%  (p=0.000 n=10+11)
Opcodes/PACK/255-8                     11.1µs ± 4%    10.2µs ± 6%    -7.96%  (p=0.000 n=11+11)
Opcodes/PACK/1024-8                    38.9µs ± 4%    37.4µs ± 9%      ~     (p=0.173 n=10+11)
Opcodes/UNPACK/1-8                     1.32µs ±34%    0.96µs ± 6%   -27.57%  (p=0.000 n=10+11)
Opcodes/UNPACK/255-8                   27.2µs ±14%    16.0µs ±13%   -41.04%  (p=0.000 n=11+11)
Opcodes/UNPACK/1024-8                   102µs ±10%      64µs ±16%   -37.33%  (p=0.000 n=10+11)

name                                 old alloc/op   new alloc/op   delta
Opcodes/XDROP/0/1-8                     0.00B          0.00B           ~     (all equal)
Opcodes/XDROP/0/1024-8                  0.00B          0.00B           ~     (all equal)
Opcodes/XDROP/1024/1024-8               0.00B          0.00B           ~     (all equal)
Opcodes/XDROP/2047/2048-8               0.00B          0.00B           ~     (all equal)
Opcodes/DUP/null-8                      48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/boolean-8                   48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/integer/small-8             96.0B ± 0%     48.0B ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/DUP/integer/big-8                104B ± 0%       56B ± 0%   -46.15%  (p=0.000 n=11+11)
Opcodes/DUP/bytearray/small-8           88.0B ± 0%     40.0B ± 0%   -54.55%  (p=0.000 n=11+11)
Opcodes/DUP/bytearray/big-8            65.6kB ± 0%    65.6kB ± 0%    -0.07%  (p=0.000 n=10+9)
Opcodes/DUP/buffer/small-8              48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/buffer/big-8                48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/struct/small-8              48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/struct/big-8                48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/pointer-8                    112B ± 0%       64B ± 0%   -42.86%  (p=0.000 n=11+11)
Opcodes/OVER/null-8                     48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/boolean-8                  48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/integer/small-8            96.0B ± 0%     48.0B ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/OVER/integer/big-8               104B ± 0%       56B ± 0%   -46.15%  (p=0.000 n=11+11)
Opcodes/OVER/bytearray/small-8          88.0B ± 0%     40.0B ± 0%   -54.55%  (p=0.000 n=11+11)
Opcodes/OVER/bytearray/big-8           65.6kB ± 0%    65.6kB ± 0%    -0.07%  (p=0.000 n=9+11)
Opcodes/OVER/buffer/small-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/buffer/big-8               48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/struct/small-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/struct/big-8               48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/pointer-8                   112B ± 0%       64B ± 0%   -42.86%  (p=0.000 n=11+11)
Opcodes/PICK/2/null-8                   48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/boolean-8                48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/integer/small-8          96.0B ± 0%     48.0B ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/integer/big-8             104B ± 0%       56B ± 0%   -46.15%  (p=0.000 n=11+11)
Opcodes/PICK/2/bytearray/small-8        88.0B ± 0%     40.0B ± 0%   -54.55%  (p=0.000 n=11+11)
Opcodes/PICK/2/bytearray/big-8         65.6kB ± 0%    65.6kB ± 0%    -0.07%  (p=0.001 n=9+11)
Opcodes/PICK/2/buffer/small-8           48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/buffer/big-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/struct/small-8           48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/struct/big-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/pointer-8                 112B ± 0%       64B ± 0%   -42.86%  (p=0.000 n=11+11)
Opcodes/PICK/1024/null-8                48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/boolean-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/integer/small-8       96.0B ± 0%     48.0B ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/integer/big-8          104B ± 0%       56B ± 0%   -46.15%  (p=0.000 n=11+11)
Opcodes/PICK/1024/bytearray/small-8     88.0B ± 0%     40.0B ± 0%   -54.55%  (p=0.000 n=11+11)
Opcodes/PICK/1024/bytearray/big-8      65.6kB ± 0%    65.6kB ± 0%    -0.07%  (p=0.000 n=11+11)
Opcodes/PICK/1024/buffer/small-8        48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/buffer/big-8          48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/struct/small-8        48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/pointer-8              112B ± 0%       64B ± 0%   -42.86%  (p=0.000 n=11+11)
Opcodes/TUCK/null-8                     48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/boolean-8                  48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/integer/small-8            96.0B ± 0%     48.0B ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/TUCK/integer/big-8               104B ± 0%       56B ± 0%   -46.15%  (p=0.000 n=11+11)
Opcodes/TUCK/bytearray/small-8          88.0B ± 0%     40.0B ± 0%   -54.55%  (p=0.000 n=11+11)
Opcodes/TUCK/bytearray/big-8           65.6kB ± 0%    65.6kB ± 0%    -0.07%  (p=0.000 n=10+11)
Opcodes/TUCK/buffer/small-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/buffer/big-8               48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/struct/small-8             48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/struct/big-8               48.0B ± 0%      0.0B       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/pointer-8                   112B ± 0%       64B ± 0%   -42.86%  (p=0.000 n=11+11)
Opcodes/SWAP/null-8                     0.00B          0.00B           ~     (all equal)
Opcodes/SWAP/integer-8                  0.00B          0.00B           ~     (all equal)
Opcodes/SWAP/big_bytes-8                0.00B          0.00B           ~     (all equal)
Opcodes/ROT/null-8                      0.00B          0.00B           ~     (all equal)
Opcodes/ROT/integer-8                   0.00B          0.00B           ~     (all equal)
Opcodes/ROT/big_bytes-8                 0.00B          0.00B           ~     (all equal)
Opcodes/ROLL/4/null-8                   0.00B          0.00B           ~     (all equal)
Opcodes/ROLL/4/integer-8                0.00B          0.00B           ~     (all equal)
Opcodes/ROLL/4/big_bytes-8              0.00B          0.00B           ~     (all equal)
Opcodes/ROLL/1024/null-8                0.00B          0.00B           ~     (all equal)
Opcodes/ROLL/1024/integer-8             0.00B          0.00B           ~     (all equal)
Opcodes/ROLL/1024/big_bytes-8           0.00B          0.00B           ~     (all equal)
Opcodes/REVERSE3/null-8                 0.00B          0.00B           ~     (all equal)
Opcodes/REVERSE3/integer-8              0.00B          0.00B           ~     (all equal)
Opcodes/REVERSE3/big_bytes-8            0.00B          0.00B           ~     (all equal)
Opcodes/REVERSE4/null-8                 0.00B          0.00B           ~     (all equal)
Opcodes/REVERSE4/integer-8              0.00B          0.00B           ~     (all equal)
Opcodes/REVERSE4/big_bytes-8            0.00B          0.00B           ~     (all equal)
Opcodes/REVERSEN/5/null-8               0.00B          0.00B           ~     (all equal)
Opcodes/REVERSEN/5/integer-8            0.00B          0.00B           ~     (all equal)
Opcodes/REVERSEN/5/big_bytes-8          0.00B          0.00B           ~     (all equal)
Opcodes/REVERSEN/1024/null-8            0.00B          0.00B           ~     (all equal)
Opcodes/REVERSEN/1024/integer-8         0.00B          0.00B           ~     (all equal)
Opcodes/REVERSEN/1024/big_bytes-8       0.00B          0.00B           ~     (all equal)
Opcodes/PACK/1-8                         144B ± 0%       96B ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PACK/255-8                     4.22kB ± 0%    4.18kB ± 0%    -1.14%  (p=0.000 n=11+11)
Opcodes/PACK/1024-8                    16.5kB ± 0%    16.5kB ± 0%    -0.29%  (p=0.000 n=11+11)
Opcodes/UNPACK/1-8                       168B ± 0%       72B ± 0%   -57.14%  (p=0.000 n=11+11)
Opcodes/UNPACK/255-8                   12.4kB ± 0%     7.8kB ± 0%   -37.28%  (p=0.000 n=11+11)
Opcodes/UNPACK/1024-8                  49.3kB ± 0%    52.8kB ± 0%    +7.18%  (p=0.000 n=11+11)

name                                 old allocs/op  new allocs/op  delta
Opcodes/XDROP/0/1-8                      0.00           0.00           ~     (all equal)
Opcodes/XDROP/0/1024-8                   0.00           0.00           ~     (all equal)
Opcodes/XDROP/1024/1024-8                0.00           0.00           ~     (all equal)
Opcodes/XDROP/2047/2048-8                0.00           0.00           ~     (all equal)
Opcodes/DUP/null-8                       1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/boolean-8                    1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/integer/small-8              3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/DUP/integer/big-8                3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/DUP/bytearray/small-8            3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/DUP/bytearray/big-8              3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/DUP/buffer/small-8               1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/buffer/big-8                 1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/struct/small-8               1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/struct/big-8                 1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/DUP/pointer-8                    2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/OVER/null-8                      1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/boolean-8                   1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/integer/small-8             3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/OVER/integer/big-8               3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/OVER/bytearray/small-8           3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/OVER/bytearray/big-8             3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/OVER/buffer/small-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/buffer/big-8                1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/struct/small-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/struct/big-8                1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/OVER/pointer-8                   2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/null-8                    1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/boolean-8                 1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/integer/small-8           3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/2/integer/big-8             3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/2/bytearray/small-8         3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/2/bytearray/big-8           3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/2/buffer/small-8            1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/buffer/big-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/struct/small-8            1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/struct/big-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/2/pointer-8                 2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/null-8                 1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/boolean-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/integer/small-8        3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/1024/integer/big-8          3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/1024/bytearray/small-8      3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/1024/bytearray/big-8        3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/PICK/1024/buffer/small-8         1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/buffer/big-8           1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/struct/small-8         1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/PICK/1024/pointer-8              2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/TUCK/null-8                      1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/boolean-8                   1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/integer/small-8             3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/TUCK/integer/big-8               3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/TUCK/bytearray/small-8           3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/TUCK/bytearray/big-8             3.00 ± 0%      2.00 ± 0%   -33.33%  (p=0.000 n=11+11)
Opcodes/TUCK/buffer/small-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/buffer/big-8                1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/struct/small-8              1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/struct/big-8                1.00 ± 0%      0.00       -100.00%  (p=0.000 n=11+11)
Opcodes/TUCK/pointer-8                   2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.000 n=11+11)
Opcodes/SWAP/null-8                      0.00           0.00           ~     (all equal)
Opcodes/SWAP/integer-8                   0.00           0.00           ~     (all equal)
Opcodes/SWAP/big_bytes-8                 0.00           0.00           ~     (all equal)
Opcodes/ROT/null-8                       0.00           0.00           ~     (all equal)
Opcodes/ROT/integer-8                    0.00           0.00           ~     (all equal)
Opcodes/ROT/big_bytes-8                  0.00           0.00           ~     (all equal)
Opcodes/ROLL/4/null-8                    0.00           0.00           ~     (all equal)
Opcodes/ROLL/4/integer-8                 0.00           0.00           ~     (all equal)
Opcodes/ROLL/4/big_bytes-8               0.00           0.00           ~     (all equal)
Opcodes/ROLL/1024/null-8                 0.00           0.00           ~     (all equal)
Opcodes/ROLL/1024/integer-8              0.00           0.00           ~     (all equal)
Opcodes/ROLL/1024/big_bytes-8            0.00           0.00           ~     (all equal)
Opcodes/REVERSE3/null-8                  0.00           0.00           ~     (all equal)
Opcodes/REVERSE3/integer-8               0.00           0.00           ~     (all equal)
Opcodes/REVERSE3/big_bytes-8             0.00           0.00           ~     (all equal)
Opcodes/REVERSE4/null-8                  0.00           0.00           ~     (all equal)
Opcodes/REVERSE4/integer-8               0.00           0.00           ~     (all equal)
Opcodes/REVERSE4/big_bytes-8             0.00           0.00           ~     (all equal)
Opcodes/REVERSEN/5/null-8                0.00           0.00           ~     (all equal)
Opcodes/REVERSEN/5/integer-8             0.00           0.00           ~     (all equal)
Opcodes/REVERSEN/5/big_bytes-8           0.00           0.00           ~     (all equal)
Opcodes/REVERSEN/1024/null-8             0.00           0.00           ~     (all equal)
Opcodes/REVERSEN/1024/integer-8          0.00           0.00           ~     (all equal)
Opcodes/REVERSEN/1024/big_bytes-8        0.00           0.00           ~     (all equal)
Opcodes/PACK/1-8                         5.00 ± 0%      4.00 ± 0%   -20.00%  (p=0.000 n=11+11)
Opcodes/PACK/255-8                       5.00 ± 0%      4.00 ± 0%   -20.00%  (p=0.000 n=11+11)
Opcodes/PACK/1024-8                      5.00 ± 0%      4.00 ± 0%   -20.00%  (p=0.000 n=11+11)
Opcodes/UNPACK/1-8                       5.00 ± 0%      3.00 ± 0%   -40.00%  (p=0.000 n=11+11)
Opcodes/UNPACK/255-8                      259 ± 0%         7 ± 0%   -97.30%  (p=0.000 n=11+11)
Opcodes/UNPACK/1024-8                   1.03k ± 0%     0.01k ± 0%   -98.93%  (p=0.000 n=11+11)
---
 pkg/core/blockchain.go |   5 +-
 pkg/vm/bench_test.go   |  50 ++++++++
 pkg/vm/context.go      |   5 +-
 pkg/vm/debug_test.go   |   6 +-
 pkg/vm/stack.go        | 272 ++++++++++++++---------------------------
 pkg/vm/stack_test.go   |  45 ++++---
 pkg/vm/vm.go           | 101 +++++++--------
 pkg/vm/vm_test.go      |  14 +--
 8 files changed, 228 insertions(+), 270 deletions(-)
 create mode 100644 pkg/vm/bench_test.go

diff --git a/pkg/core/blockchain.go b/pkg/core/blockchain.go
index 88e3615d3..1a20308f2 100644
--- a/pkg/core/blockchain.go
+++ b/pkg/core/blockchain.go
@@ -1917,8 +1917,9 @@ func (bc *Blockchain) verifyHashAgainstScript(hash util.Uint160, witness *transa
 	if vm.HasFailed() {
 		return 0, fmt.Errorf("%w: vm execution has failed: %v", ErrVerificationFailed, err)
 	}
-	resEl := vm.Estack().Pop()
-	if resEl != nil {
+	estack := vm.Estack()
+	if estack.Len() > 0 {
+		resEl := estack.Pop()
 		res, err := resEl.Item().TryBool()
 		if err != nil {
 			return 0, fmt.Errorf("%w: invalid return value", ErrVerificationFailed)
diff --git a/pkg/vm/bench_test.go b/pkg/vm/bench_test.go
new file mode 100644
index 000000000..c0403ea8b
--- /dev/null
+++ b/pkg/vm/bench_test.go
@@ -0,0 +1,50 @@
+package vm
+
+import (
+	"encoding/base64"
+	"strconv"
+	"testing"
+
+	"github.com/nspcc-dev/neo-go/pkg/vm/opcode"
+	"github.com/stretchr/testify/require"
+)
+
+func benchScript(t *testing.B, script []byte) {
+	for n := 0; n < t.N; n++ {
+		t.StopTimer()
+		vm := load(script)
+		t.StartTimer()
+		err := vm.Run()
+		t.StopTimer()
+		require.NoError(t, err)
+		t.StartTimer()
+	}
+}
+
+// Shared as is by @ixje once upon a time (compiled from Python).
+func BenchmarkScriptFibonacci(t *testing.B) {
+	var script = []byte{87, 5, 0, 16, 112, 17, 113, 105, 104, 18, 192, 114, 16, 115, 34, 28, 104, 105, 158, 116, 106, 108, 75,
+		217, 48, 38, 5, 139, 34, 5, 207, 34, 3, 114, 105, 112, 108, 113, 107, 17, 158, 115, 107, 12, 2, 94, 1,
+		219, 33, 181, 36, 222, 106, 64}
+	benchScript(t, script)
+}
+
+func BenchmarkScriptNestedRefCount(t *testing.B) {
+	b64script := "whBNEcARTRHAVgEB/gGdYBFNEU0SwFMSwFhKJPNFUUVFRQ=="
+	script, err := base64.StdEncoding.DecodeString(b64script)
+	require.NoError(t, err)
+	benchScript(t, script)
+}
+
+func BenchmarkScriptPushPop(t *testing.B) {
+	for _, i := range []int{4, 16, 128, 1024} {
+		t.Run(strconv.Itoa(i), func(t *testing.B) {
+			var script = make([]byte, i*2)
+			for p := 0; p < i; p++ {
+				script[p] = byte(opcode.PUSH1)
+				script[i+p] = byte(opcode.DROP)
+			}
+			benchScript(t, script)
+		})
+	}
+}
diff --git a/pkg/vm/context.go b/pkg/vm/context.go
index 9acd355d3..25d29f2fb 100644
--- a/pkg/vm/context.go
+++ b/pkg/vm/context.go
@@ -282,10 +282,11 @@ func (c *Context) IsDeployed() bool {
 // getContextScriptHash returns script hash of the invocation stack element
 // number n.
 func (v *VM) getContextScriptHash(n int) util.Uint160 {
-	element := v.Istack().Peek(n)
-	if element == nil {
+	istack := v.Istack()
+	if istack.Len() <= n {
 		return util.Uint160{}
 	}
+	element := istack.Peek(n)
 	ctxIface := element.Value()
 	ctx := ctxIface.(*Context)
 	return ctx.ScriptHash()
diff --git a/pkg/vm/debug_test.go b/pkg/vm/debug_test.go
index 7e6a8877e..c30964b82 100644
--- a/pkg/vm/debug_test.go
+++ b/pkg/vm/debug_test.go
@@ -20,7 +20,7 @@ func TestVM_Debug(t *testing.T) {
 		require.NoError(t, v.Run())
 		require.Equal(t, 5, v.Context().NextIP())
 		require.NoError(t, v.Run())
-		require.Equal(t, 1, v.estack.len)
+		require.Equal(t, 1, v.estack.Len())
 		require.Equal(t, big.NewInt(5), v.estack.Top().Value())
 	})
 	t.Run("StepInto", func(t *testing.T) {
@@ -29,14 +29,14 @@ func TestVM_Debug(t *testing.T) {
 		require.Equal(t, 3, v.Context().NextIP())
 		require.NoError(t, v.StepOut())
 		require.Equal(t, 2, v.Context().NextIP())
-		require.Equal(t, 1, v.estack.len)
+		require.Equal(t, 1, v.estack.Len())
 		require.Equal(t, big.NewInt(5), v.estack.Top().Value())
 	})
 	t.Run("StepOver", func(t *testing.T) {
 		v := load(prog)
 		require.NoError(t, v.StepOver())
 		require.Equal(t, 2, v.Context().NextIP())
-		require.Equal(t, 1, v.estack.len)
+		require.Equal(t, 1, v.estack.Len())
 		require.Equal(t, big.NewInt(5), v.estack.Top().Value())
 	})
 }
diff --git a/pkg/vm/stack.go b/pkg/vm/stack.go
index 5dbe31e6d..8a300628a 100644
--- a/pkg/vm/stack.go
+++ b/pkg/vm/stack.go
@@ -9,70 +9,36 @@ import (
 	"github.com/nspcc-dev/neo-go/pkg/vm/stackitem"
 )
 
-// Stack implementation for the neo-go virtual machine. The stack implements
-// a double linked list where its semantics are first in first out.
-// To simplify the implementation, internally a Stack s is implemented as a
-// ring, such that &s.top is both the next element of the last element s.Back()
-// and the previous element of the first element s.Top().
-//
-// s.Push(0)
-// s.Push(1)
-// s.Push(2)
-//
-// [ 2 ] > top
-// [ 1 ]
-// [ 0 ] > back
-//
-// s.Pop() > 2
-//
-// [ 1 ]
-// [ 0 ]
+// Stack implementation for the neo-go virtual machine. The stack with its LIFO
+// semantics is emulated by a simple slice where the top of the stack corresponds
+// to the last element of this slice. Pushes are appends to this slice, pops are
+// slice resizes.
 
-// Element represents an element in the double linked list (the stack),
-// which will hold the underlying stackitem.Item.
+// Element represents an element on the stack. Technically, it's a wrapper around
+// the stackitem.Item interface that provides some API simplification for the VM.
 type Element struct {
-	value      stackitem.Item
-	next, prev *Element
-	stack      *Stack
+	value stackitem.Item
 }
 
 // NewElement returns a new Element object, with its underlying value inferred
 // to the corresponding type.
-func NewElement(v interface{}) *Element {
-	return &Element{
-		value: stackitem.Make(v),
-	}
-}
-
-// Next returns the next element in the stack.
-func (e *Element) Next() *Element {
-	if elem := e.next; e.stack != nil && elem != &e.stack.top {
-		return elem
-	}
-	return nil
-}
-
-// Prev returns the previous element in the stack.
-func (e *Element) Prev() *Element {
-	if elem := e.prev; e.stack != nil && elem != &e.stack.top {
-		return elem
-	}
-	return nil
+func NewElement(v interface{}) Element {
+	return Element{stackitem.Make(v)}
 }
 
 // Item returns Item contained in the element.
-func (e *Element) Item() stackitem.Item {
+func (e Element) Item() stackitem.Item {
 	return e.value
 }
 
 // Value returns value of the Item contained in the element.
-func (e *Element) Value() interface{} {
+func (e Element) Value() interface{} {
 	return e.value.Value()
 }
 
 // BigInt attempts to get the underlying value of the element as a big integer.
 // Will panic if the assertion failed which will be caught by the VM.
-func (e *Element) BigInt() *big.Int {
+func (e Element) BigInt() *big.Int {
 	val, err := e.value.TryInteger()
 	if err != nil {
 		panic(err)
@@ -82,7 +48,7 @@ func (e *Element) BigInt() *big.Int {
 
 // Bool converts an underlying value of the element to a boolean if it's
 // possible to do so, it will panic otherwise.
-func (e *Element) Bool() bool {
+func (e Element) Bool() bool {
 	b, err := e.value.TryBool()
 	if err != nil {
 		panic(err)
@@ -92,7 +58,7 @@ func (e *Element) Bool() bool {
 
 // Bytes attempts to get the underlying value of the element as a byte array.
 // Will panic if the assertion failed which will be caught by the VM.
-func (e *Element) Bytes() []byte {
+func (e Element) Bytes() []byte {
 	bs, err := e.value.TryBytes()
 	if err != nil {
 		panic(err)
@@ -102,7 +68,7 @@ func (e *Element) Bytes() []byte {
 
 // BytesOrNil attempts to get the underlying value of the element as a byte array or nil.
 // Will panic if the assertion failed which will be caught by the VM.
-func (e *Element) BytesOrNil() []byte {
+func (e Element) BytesOrNil() []byte {
 	if _, ok := e.value.(stackitem.Null); ok {
 		return nil
 	}
@@ -115,7 +81,7 @@ func (e *Element) BytesOrNil() []byte {
 
 // String attempts to get string from the element value.
 // It is assumed to be use in interops and panics if string is not a valid UTF-8 byte sequence.
-func (e *Element) String() string {
+func (e Element) String() string {
 	s, err := stackitem.ToString(e.value)
 	if err != nil {
 		panic(err)
@@ -126,7 +92,7 @@ func (e *Element) String() string {
 // Array attempts to get the underlying value of the element as an array of
 // other items. Will panic if the item type is different which will be caught
 // by the VM.
-func (e *Element) Array() []stackitem.Item {
+func (e Element) Array() []stackitem.Item {
 	switch t := e.value.(type) {
 	case *stackitem.Array:
 		return t.Value().([]stackitem.Item)
@@ -139,7 +105,7 @@ func (e *Element) Array() []stackitem.Item {
 
 // Interop attempts to get the underlying value of the element
 // as an interop item.
-func (e *Element) Interop() *stackitem.Interop {
+func (e Element) Interop() *stackitem.Interop {
 	switch t := e.value.(type) {
 	case *stackitem.Interop:
 		return t
@@ -148,12 +114,11 @@ func (e *Element) Interop() *stackitem.Interop {
 	}
 }
 
-// Stack represents a Stack backed by a double linked list.
+// Stack represents a Stack backed by a slice of Elements.
 type Stack struct {
-	top  Element
-	name string
-	len  int
-	refs *refCounter
+	elems []Element
+	name  string
+	refs  *refCounter
 }
 
 // NewStack returns a new stack name by the given name.
@@ -163,64 +128,43 @@ func NewStack(n string) *Stack {
 
 func newStack(n string, refc *refCounter) *Stack {
 	s := new(Stack)
+	s.elems = make([]Element, 0, 16) // Most of uses are expected to fit into 16 elements.
 	initStack(s, n, refc)
 	return s
 }
 func initStack(s *Stack, n string, refc *refCounter) {
 	s.name = n
 	s.refs = refc
-	s.top.next = &s.top
-	s.top.prev = &s.top
-	s.len = 0
+	s.Clear()
 }
 
 // Clear clears all elements on the stack and set its length to 0.
 func (s *Stack) Clear() {
-	s.top.next = &s.top
-	s.top.prev = &s.top
-	s.len = 0
+	if s.elems != nil {
+		s.elems = s.elems[:0]
+	}
 }
 
 // Len returns the number of elements that are on the stack.
 func (s *Stack) Len() int {
-	return s.len
-}
-
-// insert inserts the element after element (at) on the stack.
-func (s *Stack) insert(e, at *Element) *Element {
-	// If we insert an element that is already popped from this stack,
-	// we need to clean it up, there are still pointers referencing to it.
-	if e.stack == s {
-		e = NewElement(e.value)
-	}
-
-	n := at.next
-	at.next = e
-	e.prev = at
-	e.next = n
-	n.prev = e
-	e.stack = s
-	s.len++
-
-	s.refs.Add(e.value)
-
-	return e
+	return len(s.elems)
 }
 
 // InsertAt inserts the given item (n) deep on the stack.
-// Be very careful using it and _always_ check both e and n before invocation
-// as it will silently do wrong things otherwise.
-func (s *Stack) InsertAt(e *Element, n int) *Element {
-	before := s.Peek(n - 1)
-	if before == nil {
-		return nil
-	}
-	return s.insert(e, before)
+// Be very careful using it and _always_ check n before invocation
+// as it will panic otherwise.
+func (s *Stack) InsertAt(e Element, n int) {
+	l := len(s.elems)
+	s.elems = append(s.elems, e)
+	copy(s.elems[l-n+1:], s.elems[l-n:l])
+	s.elems[l-n] = e
+	s.refs.Add(e.value)
 }
 
 // Push pushes the given element on the stack.
-func (s *Stack) Push(e *Element) {
-	s.insert(e, &s.top)
+func (s *Stack) Push(e Element) {
+	s.elems = append(s.elems, e)
+	s.refs.Add(e.value)
 }
 
 // PushVal pushes the given value on the stack. It will infer the
@@ -229,63 +173,49 @@ func (s *Stack) PushVal(v interface{}) {
 	s.Push(NewElement(v))
 }
 
-// Pop removes and returns the element on top of the stack.
-func (s *Stack) Pop() *Element {
-	return s.Remove(s.Top())
+// Pop removes and returns the element on top of the stack. Panics if stack is
+// empty.
+func (s *Stack) Pop() Element {
+	l := len(s.elems)
+	e := s.elems[l-1]
+	s.elems = s.elems[:l-1]
+	s.refs.Remove(e.value)
+	return e
 }
 
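-// Top returns the element on top of the stack. Nil if the stack
-// is empty.
+// Top returns the element on top of the stack. An empty Element (with nil
+// Item) is returned if the stack is empty.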
-func (s *Stack) Top() *Element {
-	if s.len == 0 {
-		return nil
+func (s *Stack) Top() Element {
+	if len(s.elems) == 0 {
+		return Element{}
 	}
-	return s.top.next
+	return s.elems[len(s.elems)-1]
 }
 
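-// Back returns the element at the end of the stack. Nil if the stack
-// is empty.
+// Back returns the element at the end of the stack. An empty Element (with
+// nil Item) is returned if the stack is empty.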
-func (s *Stack) Back() *Element {
-	if s.len == 0 {
-		return nil
+func (s *Stack) Back() Element {
+	if len(s.elems) == 0 {
+		return Element{}
 	}
-	return s.top.prev
+	return s.elems[0]
 }
 
 // Peek returns the element (n) far in the stack beginning from
-// the top of the stack.
-// 	n = 0 => will return the element on top of the stack.
-func (s *Stack) Peek(n int) *Element {
-	i := 0
-	for e := s.Top(); e != nil; e = e.Next() {
-		if n == i {
-			return e
-		}
-		i++
-	}
-	return nil
+// the top of the stack. For n == 0 it's effectively the same as Top,
+// but it'll panic if the stack is empty.
+func (s *Stack) Peek(n int) Element {
+	n = len(s.elems) - n - 1
+	return s.elems[n]
 }
 
 // RemoveAt removes the element (n) deep on the stack beginning
-// from the top of the stack.
-func (s *Stack) RemoveAt(n int) *Element {
-	return s.Remove(s.Peek(n))
-}
-
-// Remove removes and returns the given element from the stack.
-func (s *Stack) Remove(e *Element) *Element {
-	if e == nil {
-		return nil
-	}
-	e.prev.next = e.next
-	e.next.prev = e.prev
-	e.next = nil // avoid memory leaks.
-	e.prev = nil // avoid memory leaks.
-	e.stack = nil
-	s.len--
-
+// from the top of the stack. Panics if called with out of bounds n.
+func (s *Stack) RemoveAt(n int) Element {
+	l := len(s.elems)
+	e := s.elems[l-1-n]
+	s.elems = append(s.elems[:l-1-n], s.elems[l-n:]...)
 	s.refs.Remove(e.value)
-
 	return e
 }
 
@@ -293,15 +223,9 @@ func (s *Stack) Remove(e *Element) *Element {
 // Dup is used for copying elements on to the top of its own stack.
 // 	s.Push(s.Peek(0)) // will result in unexpected behaviour.
 // 	s.Push(s.Dup(0)) // is the correct approach.
-func (s *Stack) Dup(n int) *Element {
+func (s *Stack) Dup(n int) Element {
 	e := s.Peek(n)
-	if e == nil {
-		return nil
-	}
-
-	return &Element{
-		value: e.value.Dup(),
-	}
+	return Element{e.value.Dup()}
 }
 
 // Iter iterates over all the elements int the stack, starting from the top
@@ -309,9 +233,9 @@ func (s *Stack) Dup(n int) *Element {
-// 	s.Iter(func(elem *Element) {
+// 	s.Iter(func(elem Element) {
 //		// do something with the element.
 // 	})
-func (s *Stack) Iter(f func(*Element)) {
-	for e := s.Top(); e != nil; e = e.Next() {
-		f(e)
+func (s *Stack) Iter(f func(Element)) {
+	for i := len(s.elems) - 1; i >= 0; i-- {
+		f(s.elems[i])
 	}
 }
 
@@ -320,9 +244,9 @@ func (s *Stack) Iter(f func(*Element)) {
-// 	s.IterBack(func(elem *Element) {
+// 	s.IterBack(func(elem Element) {
 //		// do something with the element.
 // 	})
-func (s *Stack) IterBack(f func(*Element)) {
-	for e := s.Back(); e != nil; e = e.Prev() {
-		f(e)
+func (s *Stack) IterBack(f func(Element)) {
+	for i := 0; i < len(s.elems); i++ {
+		f(s.elems[i])
 	}
 }
 
@@ -331,37 +255,27 @@ func (s *Stack) Swap(n1, n2 int) error {
 	if n1 < 0 || n2 < 0 {
 		return errors.New("negative index")
 	}
-	if n1 >= s.len || n2 >= s.len {
+	l := len(s.elems)
+	if n1 >= l || n2 >= l {
 		return errors.New("too big index")
 	}
-	if n1 == n2 {
-		return nil
-	}
-	s.swap(n1, n2)
+	s.elems[l-n1-1], s.elems[l-n2-1] = s.elems[l-n2-1], s.elems[l-n1-1]
 	return nil
 }
 
-func (s *Stack) swap(n1, n2 int) {
-	a := s.Peek(n1)
-	b := s.Peek(n2)
-	a.value, b.value = b.value, a.value
-}
-
 // ReverseTop reverses top n items of the stack.
 func (s *Stack) ReverseTop(n int) error {
+	l := len(s.elems)
 	if n < 0 {
 		return errors.New("negative index")
-	} else if n > s.len {
+	} else if n > l {
 		return errors.New("too big index")
 	} else if n <= 1 {
 		return nil
 	}
 
-	a, b := s.Peek(0), s.Peek(n-1)
-	for i := 0; i < n/2; i++ {
-		a.value, b.value = b.value, a.value
-		a = a.Next()
-		b = b.Prev()
+	for i, j := l-n, l-1; i <= j; i, j = i+1, j-1 {
+		s.elems[i], s.elems[j] = s.elems[j], s.elems[i]
 	}
 	return nil
 }
@@ -373,24 +287,16 @@ func (s *Stack) Roll(n int) error {
 	if n < 0 {
 		return errors.New("negative index")
 	}
-	if n >= s.len {
+	l := len(s.elems)
+	if n >= l {
 		return errors.New("too big index")
 	}
 	if n == 0 {
 		return nil
 	}
-	top := s.Peek(0)
-	e := s.Peek(n)
-
-	e.prev.next = e.next
-	e.next.prev = e.prev
-
-	top.prev = e
-	e.next = top
-
-	e.prev = &s.top
-	s.top.next = e
-
+	e := s.elems[l-1-n]
+	copy(s.elems[l-1-n:], s.elems[l-n:])
+	s.elems[l-1] = e
 	return nil
 }
 
@@ -399,10 +305,10 @@ func (s *Stack) Roll(n int) error {
 func (s *Stack) PopSigElements() ([][]byte, error) {
 	var num int
 	var elems [][]byte
-	item := s.Pop()
-	if item == nil {
+	if s.Len() == 0 {
 		return nil, fmt.Errorf("nothing on the stack")
 	}
+	item := s.Pop()
 	switch item.value.(type) {
 	case *stackitem.Array:
 		num = len(item.Array())
@@ -432,8 +338,8 @@ func (s *Stack) PopSigElements() ([][]byte, error) {
 
 // ToArray converts stack to an array of stackitems with top item being the last.
 func (s *Stack) ToArray() []stackitem.Item {
-	items := make([]stackitem.Item, 0, s.len)
-	s.IterBack(func(e *Element) {
+	items := make([]stackitem.Item, 0, len(s.elems))
+	s.IterBack(func(e Element) {
 		items = append(items, e.Item())
 	})
 	return items
diff --git a/pkg/vm/stack_test.go b/pkg/vm/stack_test.go
index 885ccbf08..e596a5eef 100644
--- a/pkg/vm/stack_test.go
+++ b/pkg/vm/stack_test.go
@@ -76,9 +76,6 @@ func TestRemoveAt(t *testing.T) {
 
 	elem := s.RemoveAt(8)
 	assert.Equal(t, elems[1], elem)
-	assert.Nil(t, elem.prev)
-	assert.Nil(t, elem.next)
-	assert.Nil(t, elem.stack)
 
 	// Test if the pointers are moved.
 	assert.Equal(t, elems[0], s.Peek(8))
@@ -147,8 +144,6 @@ func TestRemoveLastElement(t *testing.T) {
 	}
 	elem := s.RemoveAt(1)
 	assert.Equal(t, elems[0], elem)
-	assert.Nil(t, elem.prev)
-	assert.Nil(t, elem.next)
 	assert.Equal(t, 1, s.Len())
 }
 
@@ -163,7 +158,7 @@ func TestIterAfterRemove(t *testing.T) {
 	s.RemoveAt(0)
 
 	i := 0
-	s.Iter(func(elem *Element) {
+	s.Iter(func(_ Element) {
 		i++
 	})
 	assert.Equal(t, len(elems)-1, i)
@@ -180,15 +175,16 @@ func TestIteration(t *testing.T) {
 	}
 	assert.Equal(t, len(elems), s.Len())
 
-	iteratedElems := make([]*Element, 0)
+	iteratedElems := make([]Element, 0)
 
-	s.Iter(func(elem *Element) {
+	s.Iter(func(elem Element) {
 		iteratedElems = append(iteratedElems, elem)
 	})
+
 	// Top to bottom order of iteration.
-	poppedElems := make([]*Element, 0)
-	for elem := s.Pop(); elem != nil; elem = s.Pop() {
-		poppedElems = append(poppedElems, elem)
+	poppedElems := make([]Element, 0)
+	for s.Len() != 0 {
+		poppedElems = append(poppedElems, s.Pop())
 	}
 	assert.Equal(t, poppedElems, iteratedElems)
 }
@@ -204,9 +200,9 @@ func TestBackIteration(t *testing.T) {
 	}
 	assert.Equal(t, len(elems), s.Len())
 
-	iteratedElems := make([]*Element, 0)
+	iteratedElems := make([]Element, 0)
 
-	s.IterBack(func(elem *Element) {
+	s.IterBack(func(elem Element) {
 		iteratedElems = append(iteratedElems, elem)
 	})
 	// Bottom to the top order of iteration.
@@ -331,6 +327,25 @@ func TestRoll(t *testing.T) {
 	assert.Equal(t, int64(1), s.Pop().BigInt().Int64())
 }
 
+func TestInsertAt(t *testing.T) {
+	s := NewStack("stack")
+	s.PushVal(1)
+	s.PushVal(2)
+	s.PushVal(3)
+	s.PushVal(4)
+	s.PushVal(5)
+
+	e := s.Dup(1) // it's `4`
+	s.InsertAt(e, 3)
+
+	assert.Equal(t, int64(5), s.Peek(0).BigInt().Int64())
+	assert.Equal(t, int64(4), s.Peek(1).BigInt().Int64())
+	assert.Equal(t, int64(3), s.Peek(2).BigInt().Int64())
+	assert.Equal(t, int64(4), s.Peek(3).BigInt().Int64())
+	assert.Equal(t, int64(2), s.Peek(4).BigInt().Int64())
+	assert.Equal(t, int64(1), s.Peek(5).BigInt().Int64())
+}
+
 func TestPopSigElements(t *testing.T) {
 	s := NewStack("test")
 
@@ -369,8 +384,8 @@ func TestPopSigElements(t *testing.T) {
 	assert.Equal(t, z, [][]byte{b1, b2})
 }
 
-func makeElements(n int) []*Element {
-	elems := make([]*Element, n)
+func makeElements(n int) []Element {
+	elems := make([]Element, n)
 	for i := 0; i < n; i++ {
 		elems[i] = NewElement(i)
 	}
diff --git a/pkg/vm/vm.go b/pkg/vm/vm.go
index 3ad7b11a6..35b89dfa3 100644
--- a/pkg/vm/vm.go
+++ b/pkg/vm/vm.go
@@ -328,11 +328,10 @@ func (v *VM) Context() *Context {
 // PopResult is used to pop the first item of the evaluation stack. This allows
 // us to test compiler and vm in a bi-directional way.
 func (v *VM) PopResult() interface{} {
-	e := v.estack.Pop()
-	if e != nil {
-		return e.Value()
+	if v.estack.Len() == 0 {
+		return nil
 	}
-	return nil
+	return v.estack.Pop().Value()
 }
 
 // Stack returns json formatted representation of the given stack.
@@ -448,8 +447,8 @@ func (v *VM) StepOut() error {
 		v.state = NoneState
 	}
 
-	expSize := v.istack.len
-	for v.state == NoneState && v.istack.len >= expSize {
+	expSize := v.istack.Len()
+	for v.state == NoneState && v.istack.Len() >= expSize {
 		err = v.StepInto()
 	}
 	if v.state == NoneState {
@@ -470,10 +469,10 @@ func (v *VM) StepOver() error {
 		v.state = NoneState
 	}
 
-	expSize := v.istack.len
+	expSize := v.istack.Len()
 	for {
 		err = v.StepInto()
-		if !(v.state == NoneState && v.istack.len > expSize) {
+		if !(v.state == NoneState && v.istack.Len() > expSize) {
 			break
 		}
 	}
@@ -739,20 +738,20 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		v.estack.Pop()
 
 	case opcode.NIP:
-		elem := v.estack.RemoveAt(1)
-		if elem == nil {
+		if v.estack.Len() < 2 {
 			panic("no second element found")
 		}
+		_ = v.estack.RemoveAt(1)
 
 	case opcode.XDROP:
 		n := int(v.estack.Pop().BigInt().Int64())
 		if n < 0 {
 			panic("invalid length")
 		}
-		e := v.estack.RemoveAt(n)
-		if e == nil {
+		if v.estack.Len() < n+1 {
 			panic("bad index")
 		}
+		_ = v.estack.RemoveAt(n)
 
 	case opcode.CLEAR:
 		v.estack.Clear()
@@ -761,10 +760,10 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		v.estack.Push(v.estack.Dup(0))
 
 	case opcode.OVER:
-		a := v.estack.Dup(1)
-		if a == nil {
+		if v.estack.Len() < 2 {
 			panic("no second element found")
 		}
+		a := v.estack.Dup(1)
 		v.estack.Push(a)
 
 	case opcode.PICK:
@@ -772,20 +771,17 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		if n < 0 {
 			panic("negative stack item returned")
 		}
-		a := v.estack.Dup(n)
-		if a == nil {
+		if v.estack.Len() < n+1 {
 			panic("no nth element found")
 		}
+		a := v.estack.Dup(n)
 		v.estack.Push(a)
 
 	case opcode.TUCK:
-		a := v.estack.Dup(0)
-		if a == nil {
-			panic("no top-level element found")
-		}
 		if v.estack.Len() < 2 {
-			panic("can't TUCK with a one-element stack")
+			panic("too short stack to TUCK")
 		}
+		a := v.estack.Dup(0)
 		v.estack.InsertAt(a, 2)
 
 	case opcode.SWAP:
@@ -821,10 +817,8 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 
 	// Bit operations.
 	case opcode.INVERT:
-		// inplace
-		e := v.estack.Peek(0)
-		i := e.BigInt()
-		e.value = stackitem.Make(new(big.Int).Not(i))
+		i := v.estack.Pop().BigInt()
+		v.estack.PushVal(new(big.Int).Not(i))
 
 	case opcode.AND:
 		b := v.estack.Pop().BigInt()
@@ -842,14 +836,11 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		v.estack.PushVal(new(big.Int).Xor(b, a))
 
 	case opcode.EQUAL, opcode.NOTEQUAL:
+		if v.estack.Len() < 2 {
+			panic("need a pair of elements on the stack")
+		}
 		b := v.estack.Pop()
-		if b == nil {
-			panic("no top-level element found")
-		}
 		a := v.estack.Pop()
-		if a == nil {
-			panic("no second-to-the-top element found")
-		}
 		v.estack.PushVal(a.value.Equals(b.value) == (op == opcode.EQUAL))
 
 	// Numeric operations.
@@ -1100,7 +1091,7 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 			if index < 0 {
 				panic("invalid key")
 			}
-			v.estack.Push(&Element{value: t.Value().([]stackitem.MapElement)[index].Value.Dup()})
+			v.estack.Push(Element{value: t.Value().([]stackitem.MapElement)[index].Value.Dup()})
 		default:
 			arr := obj.Bytes()
 			if index < 0 || index >= len(arr) {
@@ -1318,13 +1309,13 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		}
 
 	case opcode.NEWMAP:
-		v.estack.Push(&Element{value: stackitem.NewMap()})
+		v.estack.Push(Element{value: stackitem.NewMap()})
 
 	case opcode.KEYS:
-		item := v.estack.Pop()
-		if item == nil {
+		if v.estack.Len() == 0 {
 			panic("no argument")
 		}
+		item := v.estack.Pop()
 
 		m, ok := item.value.(*stackitem.Map)
 		if !ok {
@@ -1338,10 +1329,10 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		v.estack.PushVal(arr)
 
 	case opcode.VALUES:
-		item := v.estack.Pop()
-		if item == nil {
+		if v.estack.Len() == 0 {
 			panic("no argument")
 		}
+		item := v.estack.Pop()
 
 		var arr []stackitem.Item
 		switch t := item.value.(type) {
@@ -1363,13 +1354,13 @@ func (v *VM) execute(ctx *Context, op opcode.Opcode, parameter []byte) (err erro
 		v.estack.PushVal(arr)
 
 	case opcode.HASKEY:
+		if v.estack.Len() < 2 {
+			panic("not enough arguments")
+		}
 		key := v.estack.Pop()
 		validateMapKey(key)
 
 		c := v.estack.Pop()
-		if c == nil {
-			panic("no value found")
-		}
 		switch t := c.value.(type) {
 		case *stackitem.Array, *stackitem.Struct:
 			index := key.BigInt().Int64()
@@ -1559,16 +1550,15 @@ func calcJumpOffset(ctx *Context, parameter []byte) (int, int, error) {
 }
 
 func (v *VM) handleException() {
-	pop := 0
-	ictxv := v.istack.Peek(0)
-	ictx := ictxv.Value().(*Context)
-	for ictx != nil {
-		e := ictx.tryStack.Peek(0)
-		for e != nil {
+	for pop := 0; pop < v.istack.Len(); pop++ {
+		ictxv := v.istack.Peek(pop)
+		ictx := ictxv.Value().(*Context)
+		for j := 0; j < ictx.tryStack.Len(); j++ {
+			e := ictx.tryStack.Peek(j)
 			ectx := e.Value().(*exceptionHandlingContext)
 			if ectx.State == eFinally || (ectx.State == eCatch && !ectx.HasFinally()) {
 				ictx.tryStack.Pop()
-				e = ictx.tryStack.Peek(0)
+				j = -1
 				continue
 			}
 			for i := 0; i < pop; i++ {
@@ -1586,12 +1576,6 @@ func (v *VM) handleException() {
 			}
 			return
 		}
-		pop++
-		ictxv = ictxv.Next()
-		if ictxv == nil {
-			break
-		}
-		ictx = ictxv.Value().(*Context)
 	}
 	throwUnhandledException(v.uncaughtException)
 }
@@ -1753,17 +1737,18 @@ func makeArrayOfType(n int, typ stackitem.Type) []stackitem.Item {
 	return items
 }
 
-func validateMapKey(key *Element) {
-	if key == nil {
+func validateMapKey(key Element) {
+	item := key.Item()
+	if item == nil {
 		panic("no key found")
 	}
-	if err := stackitem.IsValidMapKey(key.Item()); err != nil {
+	if err := stackitem.IsValidMapKey(item); err != nil {
 		panic(err)
 	}
 }
 
 func (v *VM) checkInvocationStackSize() {
-	if v.istack.len >= MaxInvocationStackSize {
+	if v.istack.Len() >= MaxInvocationStackSize {
 		panic("invocation stack is too big")
 	}
 }
@@ -1785,7 +1770,7 @@ func (v *VM) GetCallingScriptHash() util.Uint160 {
 
 // GetEntryScriptHash implements ScriptHashGetter interface.
 func (v *VM) GetEntryScriptHash() util.Uint160 {
-	return v.getContextScriptHash(v.istack.len - 1)
+	return v.getContextScriptHash(v.istack.Len() - 1)
 }
 
 // GetCurrentScriptHash implements ScriptHashGetter interface.
diff --git a/pkg/vm/vm_test.go b/pkg/vm/vm_test.go
index 3391ddeca..63ca28fd1 100644
--- a/pkg/vm/vm_test.go
+++ b/pkg/vm/vm_test.go
@@ -1176,7 +1176,7 @@ func TestPICKITEMDupMap(t *testing.T) {
 	vm := load(prog)
 	m := stackitem.NewMap()
 	m.Add(stackitem.Make(42), stackitem.Make(-1))
-	vm.estack.Push(&Element{value: m})
+	vm.estack.Push(Element{value: m})
 	runVM(t, vm)
 	assert.Equal(t, 2, vm.estack.Len())
 	assert.Equal(t, int64(1), vm.estack.Pop().BigInt().Int64())
@@ -1245,7 +1245,7 @@ func TestSETITEMBigMapGood(t *testing.T) {
 	for i := 0; i < MaxStackSize-3; i++ {
 		m.Add(stackitem.Make(i), stackitem.Make(i))
 	}
-	vm.estack.Push(&Element{value: m})
+	vm.estack.Push(Element{value: m})
 	vm.estack.PushVal(0)
 	vm.estack.PushVal(0)
 
@@ -1274,7 +1274,7 @@ func TestKEYSMap(t *testing.T) {
 	m := stackitem.NewMap()
 	m.Add(stackitem.Make(5), stackitem.Make(6))
 	m.Add(stackitem.Make([]byte{0, 1}), stackitem.Make(6))
-	vm.estack.Push(&Element{value: m})
+	vm.estack.Push(Element{value: m})
 
 	runVM(t, vm)
 	assert.Equal(t, 1, vm.estack.Len())
@@ -1298,7 +1298,7 @@ func TestVALUESMap(t *testing.T) {
 	m := stackitem.NewMap()
 	m.Add(stackitem.Make(5), stackitem.Make([]byte{2, 3}))
 	m.Add(stackitem.Make([]byte{0, 1}), stackitem.Make([]stackitem.Item{}))
-	vm.estack.Push(&Element{value: m})
+	vm.estack.Push(Element{value: m})
 
 	runVM(t, vm)
 	assert.Equal(t, 1, vm.estack.Len())
@@ -1880,7 +1880,7 @@ func TestREVERSEITEMSGoodStruct(t *testing.T) {
 		for i := range elements {
 			arr[i] = stackitem.Make(elements[i])
 		}
-		vm.estack.Push(&Element{value: stackitem.NewStruct(arr)})
+		vm.estack.Push(Element{value: stackitem.NewStruct(arr)})
 
 		runVM(t, vm)
 		assert.Equal(t, 2, vm.estack.Len())
@@ -1944,8 +1944,8 @@ func TestREMOVEMap(t *testing.T) {
 	m := stackitem.NewMap()
 	m.Add(stackitem.Make(5), stackitem.Make(3))
 	m.Add(stackitem.Make([]byte{0, 1}), stackitem.Make([]byte{2, 3}))
-	vm.estack.Push(&Element{value: m})
-	vm.estack.Push(&Element{value: m})
+	vm.estack.Push(Element{value: m})
+	vm.estack.Push(Element{value: m})
 	vm.estack.PushVal(stackitem.Make(5))
 
 	runVM(t, vm)