% vm.bib — bibliography of virtual-machine implementation papers.
% NOTE(review): removed GitHub web-scrape artifacts (page header and gutter
% line numbers) that preceded the first entry. BibTeX ignores text outside
% @entries, but the junk carried no information and obscured the file.
@inproceedings{Deutsch:1984:IC,
  abstract = {The Smalltalk-80* programming language includes dynamic storage allocation, full upward funargs, and universally polymorphic procedures; the Smalltalk-80 programming system features interactive execution with incremental compilation, and implementation portability. These features of modern programming systems are among the most difficult to implement efficiently, even individually. A new implementation of the Smalltalk-80 system, hosted on a small microprocessor-based computer, achieves high performance while retaining complete (object code) compatibility with existing implementations. This paper discusses the most significant optimization techniques developed over the course of the project, many of which are applicable to other languages. The key idea is to represent certain runtime state (both code and data) in more than one form, and to convert between forms when needed.},
  author = {Deutsch, L. Peter and Schiffman, Allan M.},
  booktitle = {POPL '84: Proceedings of the 11th ACM SIGACT-SIGPLAN symposium on Principles of programming languages},
  description = {Efficient Implementation of the Smalltalk-80 System},
  doi = {10.1145/800017.800542},
  isbn = {0-89791-125-3},
  pages = {297--302},
  publisher = {ACM},
  series = {POPL'84},
  title = {Efficient Implementation of the {Smalltalk-80} System},
  year = 1984,
  url = {http://web.cs.ucla.edu/~palsberg/course/cs232/papers/DeutschSchiffman-popl84.pdf},
  vm:concepts = {Lookup Caches, Interpretation},
  vm:shortTitle = {Inline Caches}
}
@inproceedings{Chambers:89:Self,
  abstract = {We have developed and implemented techniques that double the performance of dynamically-typed object-oriented languages. Our SELF implementation runs twice as fast as the fastest Smalltalk implementation, despite SELF's lack of classes and explicit variables. To compensate for the absence of classes, our system uses implementation-level maps to transparently group objects cloned from the same prototype, providing data type information and eliminating the apparent space overhead for prototype-based systems. To compensate for dynamic typing, user-defined control structures, and the lack of explicit variables, our system dynamically compiles multiple versions of a source method, each customized according to its receiver's map. Within each version the type of the receiver is fixed, and thus the compiler can statically bind and inline all messages sent to self. Message splitting and type prediction extract and preserve even more static type information, allowing the compiler to inline many other messages. Inlining dramatically improves performance and eliminates the need to hard-wire low-level methods such as +,==, and ifTrue:. Despite inlining and other optimizations, our system still supports interactive programming environments. The system traverses internal dependency lists to invalidate all compiled methods affected by a programming change. The debugger reconstructs inlined stack frames from compiler-generated debugging information, making inlining invisible to the SELF programmer.},
  author = {Chambers, Craig and Ungar, David and Lee, Elgin},
  booktitle = {Proceedings on Object-Oriented Programming Systems, Languages and Applications},
  doi = {10.1145/74878.74884},
  isbn = {0-89791-333-7},
  issn = {0362-1340},
  month = oct,
  pages = {49--70},
  publisher = {ACM},
  series = {OOPSLA'89},
  title = {An Efficient Implementation of {SELF} a Dynamically-Typed Object-Oriented Language Based on Prototypes},
  year = 1989,
  url = {http://www.selflanguage.org/_static/published/implementation.pdf},
  vm:concepts = {Object Representation},
  vm:shortTitle = {SELF \& Maps},
  vm:edge:uses = {
    Deutsch:1984:IC + compilation;
    Compilation + method JIT},
  vm:edge:extends = {ObjectRepresentation + efficient access and lower memory footprint}
}
@inproceedings{Hoelzle:91:PIC,
  abstract = {Polymorphic inline caches (PICs) provide a new way to reduce the overhead of polymorphic message sends by extending inline caches to include more than one cached lookup result per call site. For a set of typical object-oriented SELF programs, PICs achieve a median speedup of 11\%. As an important side effect, PICs collect type information by recording all of the receiver types actually used at a given call site. The compiler can exploit this type information to generate better code when recompiling a method. An experimental version of such a system achieves a median speedup of 27\% for our set of SELF programs, reducing the number of non-inlined message sends by a factor of two. Implementations of dynamically-typed object-oriented languages have been limited by the paucity of type information available to the compiler. The abundance of the type information provided by PICs suggests a new compilation approach for these languages, adaptive compilation . Such compilers may succeed in generating very efficient code for the time-critical parts of a program without incurring distracting compilation pauses.},
  author = {Hölzle, Urs and Chambers, Craig and Ungar, David},
  booktitle = {ECOOP '91: European Conference on Object-Oriented Programming},
  description = {Optimizing Dynamically-Typed Object-Oriented Languages With Polymorphic Inline Caches},
  doi = {10.1007/BFb0057013},
  isbn = {3-540-54262-0},
  pages = {21--38},
  publisher = {Springer},
  series = {ECOOP'91},
  title = {Optimizing Dynamically-Typed Object-Oriented Languages With Polymorphic Inline Caches},
  volume = 512,
  year = 1991,
  url = {http://bibliography.selflanguage.org/_static/pics.pdf},
  vm:concepts = {Lookup Caches},
  vm:shortTitle = {Polymorphic Inline Caches},
  vm:edge:extends = {Deutsch:1984:IC + k-size cache}
}
@inproceedings{Holzle:1992:DOC,
  author = {H{\"o}lzle, Urs and Chambers, Craig and Ungar, David},
  booktitle = {Proceedings of the ACM SIGPLAN 1992 conference on Programming language design and implementation},
  doi = {10.1145/143095.143114},
  isbn = {0-89791-475-9},
  numpages = {12},
  pages = {32--43},
  publisher = {ACM},
  series = {PLDI '92},
  title = {Debugging Optimized Code with Dynamic Deoptimization},
  year = 1992,
  url = {http://bibliography.selflanguage.org/_static/dynamic-deoptimization.pdf},
  vm:shortTitle = {Dynamic Deoptimization},
  vm:edge:extends = {Chambers:89:Self + deoptimization}
}
@inproceedings{Ertl:1995:Stack,
  abstract = {An interpreter can spend a significant part of its execution time on accessing arguments of virtual machine instructions. This paper explores two methods to reduce this overhead for virtual stack machines by caching top-of-stack values in (real machine) registers. The dynamic method is based on having, for every possible state of the cache, one specialized version of the whole interpreter; the execution of an instruction usually changes the state of the cache and the next instruction is executed in the version corresponding to the new state. In the static method a state machine that keeps track of the cache state is added to the compiler. Common instructions exist in specialized versions for several states, but it is not necessary to have a version of every instruction for every cache state. Stack manipulation instructions are optimized away.},
  author = {Ertl, M. Anton},
  booktitle = {Proceedings of the ACM SIGPLAN 1995 Conference on Programming Language Design and Implementation},
  doi = {10.1145/207110.207165},
  issn = {0362-1340},
  pages = {315--327},
  publisher = {ACM},
  series = {PLDI'95},
  title = {Stack Caching for Interpreters},
  year = 1995,
  url = {http://www.complang.tuwien.ac.at/papers/ertl95pldi.ps.gz},
  vm:concepts = {Interpretation},
  vm:shortTitle = {Top of Stack Caching},
  vm:edge:extends = {Interpretation + optimization}
}
@inproceedings{Burke:1999:JDO,
  author = {Burke, Michael G. and Choi, Jong-Deok and Fink, Stephen and Grove, David and Hind, Michael and Sarkar, Vivek and Serrano, Mauricio J. and Sreedhar, V. C. and Srinivasan, Harini and Whaley, John},
  booktitle = {Proceedings of the ACM 1999 Conference on Java Grande},
  doi = {10.1145/304065.304113},
  isbn = {1-58113-161-5},
  numpages = {13},
  pages = {129--141},
  publisher = {ACM},
  series = {JAVA'99},
  title = {The {Jalapeño} Dynamic Optimizing Compiler for {Java}},
  year = 1999,
  url = {https://pdfs.semanticscholar.org/8738/ba4b09cac68ad1ab7d036a6d5e40a17ee2c7.pdf},
  vm:shortTitle = {Adaptive Compilation},
  vm:edge:extends = {Chambers:89:Self + adaptive optimization}
}
@inproceedings{Bala:2000:DTD,
  abstract = {We describe the design and implementation of Dynamo, a software dynamic optimization system that is capable of transparently improving the performance of a native instruction stream as it executes on the processor. The input native instruction stream to Dynamo can be dynamically generated (by a JIT for example), or it can come from the execution of a statically compiled native binary. This paper evaluates the Dynamo system in the latter, more challenging situation, in order to emphasize the limits, rather than the potential, of the system. Our experiments demonstrate that even statically optimized native binaries can be accelerated Dynamo, and often by a significant degree. For example, the average performance of -O optimized SpecInt95 benchmark binaries created by the HP product C compiler is improved to a level comparable to their -O4 optimized version running without Dynamo. Dynamo achieves this by focusing its efforts on optimization opportunities that tend to manifest only at runtime, and hence opportunities that might be difficult for a static compiler to exploit. Dynamo's operation is transparent in the sense that it does not depend on any user annotations or binary instrumentation, and does not require multiple runs, or any special compiler, operating system or hardware support. The Dynamo prototype presented here is a realistic implementation running on an HP PA-8000 workstation under the HPUX 10.20 operating system.},
  author = {Bala, Vasanth and Duesterwald, Evelyn and Banerjia, Sanjeev},
  booktitle = {Proceedings of the ACM SIGPLAN 2000 Conference on Programming Language Design and Implementation},
  doi = {10.1145/358438.349303},
  isbn = {1-58113-199-2},
  numpages = {12},
  pages = {1--12},
  publisher = {ACM},
  series = {PLDI'00},
  title = {{Dynamo}: A Transparent Dynamic Optimization System},
  year = 2000,
  url = {https://www.complang.tuwien.ac.at/andi/bala.pdf},
  vm:shortTitle = {Trace-based JIT},
  vm:edge:extends = {Compilation trace JIT}
}
@inproceedings{Domani:02:TLAB,
  author = {Domani, Tamar and Goldshtein, Gal and Kolodner, Elliot K. and Lewis, Ethan and Petrank, Erez and Sheinwald, Dafna},
  booktitle = {Proceedings of the Third International Symposium on Memory Management},
  description = {Thread-local heaps for Java | Proceedings of the 3rd international symposium on Memory management},
  doi = {10.1145/512429.512439},
  keywords = {Allocation Barrier Concurrency Contention Local Threads},
  publisher = {{ACM} Press},
  series = {ISMM'02},
  title = {Thread-local Heaps for {Java}},
  url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.5846&rep=rep1&type=pdf},
  year = 2002,
  vm:concepts = {
    Garbage Collection,
    Tracking Reachability to Reduce Synchronization},
  vm:shortTitle = {Thread-Local Heaps and GC}
}
@article{Ertl:03:DIT,
  abstract = {Interpreters designed for high general-purpose performance typically perform a large
    number of indirect branches (3.2\%--13\% of all executed instructions in our benchmarks).
    These branches consume more than half of the run-time in a number of configurations
    we simulated. We evaluate how accurate various existing and proposed branch prediction
    schemes are on a number of interpreters, how the mispredictions affect the performance of
    the interpreters and how two different interpreter implementation techniques perform with
    various branch predictors. We also suggest various ways in which hardware designers, C
    compiler writers, and interpreter writers can improve the performance of interpreters.},
  added-at = {2010-12-06T17:29:58.000+0100},
  author = {Ertl, M. Anton and Gregg, David},
  journal = {Journal of Instruction-Level Parallelism},
  month = nov,
  pages = {1--25},
  timestamp = {2010-12-06T17:29:58.000+0100},
  title = {The Structure and Performance of Efficient Interpreters},
  url = {http://www.jilp.org/vol5/v5paper12.pdf},
  series = {J of ILP'03},
  volume = 5,
  year = 2003,
  vm:shortTitle = {Impact of Direct/Indirect Threading},
  vm:concepts = {Interpretation}
}
@article{Casey:07:SI,
  abstract = {Interpreters designed for efficiency execute a huge number of indirect branches and can spend more than half of the execution time in indirect branch mispredictions. Branch target buffers (BTBs) are the most widely available form of indirect branch prediction; however, their prediction accuracy for existing interpreters is only 2\%--50\%. In this article we investigate two methods for improving the prediction accuracy of BTBs for interpreters: replicating virtual machine (VM) instructions and combining sequences of VM instructions into superinstructions. We investigate static (interpreter build-time) and dynamic (interpreter runtime) variants of these techniques and compare them and several combinations of these techniques. To show their generality, we have implemented these optimizations in VMs for both Java and Forth. These techniques can eliminate nearly all of the dispatch branch mispredictions, and have other benefits, resulting in speedups by a factor of up to 4.55 over efficient threaded-code interpreters, and speedups by a factor of up to 1.34 over techniques relying on dynamic superinstructions alone.},
  author = {Casey, Kevin and Ertl, M. Anton and Gregg, David},
  doi = {10.1145/1286821.1286828},
  issn = {0164-0925},
  journal = {ACM Trans. Program. Lang. Syst.},
  number = 6,
  pages = 37,
  publisher = {ACM},
  title = {Optimizing Indirect Branch Prediction Accuracy in Virtual Machine Interpreters},
  volume = 29,
  year = 2007,
  series = {TOPLAS'07},
  url = {https://www.scss.tcd.ie/David.Gregg/papers/toplas05.pdf},
  vm:shortTitle = {Super Instructions},
  vm:concepts = {Interpretation}
}
@article{Shi:2008:SVR,
  author = {Shi, Yunhe and Casey, Kevin and Ertl, M. Anton and Gregg, David},
  doi = {10.1145/1328195.1328197},
  issn = {1544-3566},
  journal = {ACM Trans. Archit. Code Optim.},
  number = 4,
  pages = {1--36},
  publisher = {ACM},
  title = {Virtual Machine Showdown: Stack Versus Registers},
  volume = 4,
  year = 2008,
  series = {TACO'08},
  vm:concepts = {Interpretation},
  vm:shortTitle = {Stack vs. Register}
}
@inproceedings{Bolz:2009:TMP,
  abstract = {We attempt to apply the technique of Tracing JIT Compilers in the context of the PyPy project, i.e., to programs that are interpreters for some dynamic languages, including Python. Tracing JIT compilers can greatly speed up programs that spend most of their time in loops in which they take similar code paths. However, applying an unmodified tracing JIT to a program that is itself a bytecode interpreter results in very limited or no speedup. In this paper we show how to guide tracing JIT compilers to greatly improve the speed of bytecode interpreters. One crucial point is to unroll the bytecode dispatch loop, based on two kinds of hints provided by the implementer of the bytecode interpreter. We evaluate our technique by applying it to two PyPy interpreters: one is a small example, and the other one is the full Python interpreter.},
  author = {Bolz, Carl Friedrich and Cuni, Antonio and Fijalkowski, Maciej and Rigo, Armin},
  booktitle = {Proceedings of the 4th Workshop on the Implementation, Compilation, Optimization of Object-Oriented Languages and Programming Systems},
  doi = {10.1145/1565824.1565827},
  isbn = {978-1-60558-541-3},
  numpages = {8},
  pages = {18--25},
  publisher = {ACM},
  series = {ICOOOLPS'09},
  title = {Tracing the Meta-level: {PyPy}'s Tracing {JIT} Compiler},
  year = 2009,
  url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1023.9117&rep=rep1&type=pdf},
  vm:shortTitle = {Meta Tracing Interpreters},
  vm:edge:extends = {
    Bala:2000:DTD + reusable JIT;
    Gal:2009:TJT + reusable JIT}
}
@inproceedings{Gal:2009:TJT,
  author = {Gal, Andreas and Eich, Brendan and Shaver, Mike and Anderson, David and Mandelin, David and Haghighat, Mohammad R. and Kaplan, Blake and Hoare, Graydon and Zbarsky, Boris and Orendorff, Jason and Ruderman, Jesse and Smith, Edwin W. and Reitmaier, Rick and Bebenita, Michael and Chang, Mason and Franz, Michael},
  booktitle = {Proceedings of the 30th ACM SIGPLAN Conference on Programming Language Design and Implementation},
  doi = {10.1145/1542476.1542528},
  isbn = {978-1-60558-392-1},
  numpages = {14},
  pages = {465--478},
  publisher = {ACM},
  series = {PLDI'09},
  title = {Trace-based Just-in-time Type Specialization for Dynamic Languages},
  year = 2009,
  url = {http://dept.cs.williams.edu/~freund/cs434/gal-trace.pdf},
  vm:shortTitle = {TraceMonkey},
  vm:edge:extends = {
    Bala:2000:DTD + high-level language}
}
@inproceedings{Brunthaler:2010:EIU,
  abstract = {Just-in-time compilers offer the biggest achievable payoff performance-wise, but their implementation is a non-trivial, time-consuming task affecting the interpreter's maintenance for years to come, too. Recent research addresses this issue by providing ways of leveraging existing just-in-time compilation infrastructures. Though there has been considerable research on improving the efficiency of just-in-time compilers, the area of optimizing interpreters has gotten less attention as if the implementation of a dynamic translation system was the "ultima ratio" for efficiently interpreting programming languages. We present optimization techniques for improving the efficiency of interpreters without requiring just-in-time compilation thereby maintaining the ease-of-implementation characteristic that brought many people to implementing an interpreter in the first place.},
  author = {Brunthaler, Stefan},
  booktitle = {Proceedings of the 6th Symposium on Dynamic Languages},
  doi = {10.1145/1899661.1869633},
  issn = {0362-1340},
  month = oct,
  number = 12,
  numpages = {14},
  pages = {1--14},
  publisher = {ACM},
  series = {DLS'10},
  title = {Efficient Interpretation Using Quickening},
  year = 2010,
  url = {https://publications.sba-research.org/publications/dls10.pdf},
  vm:shortTitle = {Bytecode Quickening},
  vm:concepts = {Interpretation}
}
@inproceedings{Wurthinger:2012:SOAST,
  abstract = {An abstract syntax tree (AST) interpreter is a simple and natural way to implement a programming language. However, it is also considered the slowest approach because of the high overhead of virtual method dispatch. Language implementers therefore define bytecodes to speed up interpretation, at the cost of introducing inflexible and hard to maintain bytecode formats. We present a novel approach to implementing AST interpreters in which the AST is modified during interpretation to incorporate type feedback. This tree rewriting is a general and powerful mechanism to optimize many constructs common in dynamic programming languages. Our system is implemented in Java and uses the static typing and primitive data types of Java elegantly to avoid the cost of boxed representations of primitive values in dynamic programming languages.},
  author = {Würthinger, Thomas and Wöß, Andreas and Stadler, Lukas and Duboscq, Gilles and Simon, Doug and Wimmer, Christian},
  booktitle = {Proceedings of the 8th Dynamic Languages Symposium},
  doi = {10.1145/2384577.2384587},
  isbn = {978-1-4503-1564-7},
  month = oct,
  numpages = {10},
  pages = {73--82},
  publisher = {ACM},
  series = {DLS'12},
  title = {Self-Optimizing {AST} Interpreters},
  url = {http://www.christianwimmer.at/Publications/Wuerthinger12a/},
  year = 2012,
  vm:shortTitle = {SelfOpt AST Interp.},
  vm:concepts = {Interpretation},
  vm:edge:extends = {Brunthaler:2010:EIU + AST interpreters}
}
@inproceedings{Bolz:2013:SSC,
  author = {Bolz, Carl Friedrich and Diekmann, Lukas and Tratt, Laurence},
  booktitle = {Proceedings of the 2013 ACM SIGPLAN International Conference on Object Oriented Programming Systems Languages \& Applications},
  description = {Storage strategies for collections in dynamically typed languages},
  doi = {10.1145/2509136.2509531},
  isbn = {978-1-4503-2374-1},
  numpages = {16},
  pages = {167--182},
  publisher = {ACM},
  series = {OOPSLA'13},
  title = {Storage Strategies for Collections in Dynamically Typed Languages},
  year = 2013,
  url = {https://tratt.net/laurie/research/pubs/html/bolz_diekmann_tratt__storage_strategies_for_collections_in_dynamically_typed_languages/},
  vm:concepts = {Collections Representation},
  vm:shortTitle = {Storage Strategies for Collections},
  vm:edge:extends = {CollectionsRepresentation + adapt representation dynamically to unboxed values}
}
@inproceedings{Woss:14:TOM,
  abstract = {Truffle is a Java-based framework for developing high-performance language runtimes. Language implementers aiming at developing new runtimes have to design all the runtime mechanisms for managing dynamically typed objects from scratch. This not only leads to potential code duplication, but also impacts the actual time needed to develop a fully-fledged runtime. In this paper we address this issue by introducing a common object storage model (OSM) for Truffle that can be used by language implementers to develop new runtimes. The OSM is generic, language-agnostic, and portable, as it can be used to implement a great variety of dynamic languages. It is extensible, featuring built-in support for custom extension mechanisms. It is also high-performance, as it is designed to benefit from the optimizing compiler in the Truffle framework. Our initial evaluation indicates that the Truffle OSM can be used to implement high-performance language runtimes, with no performance overhead when compared to language-specific solutions.},
  author = {Wöß, Andreas and Wirth, Christian and Bonetta, Daniele and Seaton, Chris and Humer, Christian and Mössenböck, Hanspeter},
  booktitle = {Proceedings of the 2014 International Conference on Principles and Practices of Programming on the Java Platform: Virtual Machines, Languages, and Tools},
  doi = {10.1145/2647508.2647517},
  isbn = {978-1-4503-2926-2},
  numpages = {12},
  pages = {133--144},
  publisher = {ACM},
  series = {PPPJ'14},
  title = {An Object Storage Model for the {Truffle} Language Implementation Framework},
  year = 2014,
  url = {https://chrisseaton.com/rubytruffle/pppj14-om/pppj14-om.pdf},
  vm:shortTitle = {Truffle Object Model},
  vm:edge:extends = {
    Chambers:89:Self + unboxed values, read-only & type specialization}
}
@inproceedings{Duboscq:2014:SWR,
  abstract = {Speculative optimizations are used in most Just In Time (JIT) compilers in order to take advantage of dynamic runtime feedback. These speculative optimizations usually require the compiler to produce meta-data that the Virtual Machine (VM) can use as fallback when a speculation fails. This meta-data can be large and incurs a significant memory overhead since it needs to be stored alongside the machine code for as long as the machine code lives. The design of the Graal compiler leads to many speculations falling back to a similar state and location. In this paper we present deoptimization grouping, an optimization using this property of the Graal compiler to reduce the amount of meta-data that must be stored by the VM without having to modify the VM. We compare our technique with existing meta-data compression techniques from the HotSpot Virtual Machine and study how well they combine. In order to make informed decisions about speculation meta-data, we present an empirical analysis of the origin, impact and usages of this meta-data.},
  author = {Duboscq, Gilles and W\"{u}rthinger, Thomas and M\"{o}ssenb\"{o}ck, Hanspeter},
  booktitle = {Proceedings of the 2014 International Conference on Principles and Practices of Programming on the Java Platform: Virtual Machines, Languages, and Tools},
  doi = {10.1145/2647508.2647521},
  isbn = {978-1-4503-2926-2},
  numpages = {7},
  pages = {187--193},
  publisher = {ACM},
  series = {PPPJ'14},
  title = {Speculation Without Regret: Reducing Deoptimization Meta-data in the {Graal} Compiler},
  year = 2014,
  url = {http://www.ssw.uni-linz.ac.at/General/Staff/GD/PPPJ-2014-duboscq-29.pdf},
  vm:shortTitle = {Optimized Deoptimization},
  vm:edge:extends = {Holzle:1992:DOC + optimized metadata}
}
@inproceedings{Marr:15:DC,
  abstract = {Runtime metaprogramming enables many useful applications and is often a convenient solution to solve problems in a generic way, which makes it widely used in frameworks, middleware, and domain-specific languages. However, powerful metaobject protocols are rarely supported and even common concepts such as reflective method invocation or dynamic proxies are not optimized. Solutions proposed in literature either restrict the metaprogramming capabilities or require application or library developers to apply performance improving techniques.
    For overhead-free runtime metaprogramming, we demonstrate that dispatch chains, a generalized form of polymorphic inline caches common to self-optimizing interpreters, are a simple optimization at the language-implementation level. Our evaluation with self-optimizing interpreters shows that unrestricted metaobject protocols can be realized for the first time without runtime overhead, and that this optimization is applicable for just-in-time compilation of interpreters based on meta-tracing as well as partial evaluation. In this context, we also demonstrate that optimizing common reflective operations can lead to significant performance improvements for existing applications.},
  appendix = {https://stefan-marr.de/papers/pldi-marr-et-al-zero-overhead-metaprogramming-artifacts/},
  author = {Marr, Stefan and Seaton, Chris and Ducasse, Stéphane},
  blog = {https://stefan-marr.de/2015/04/zero-overhead-metaprogramming/},
  booktitle = {Proceedings of the 36th ACM SIGPLAN Conference on Programming Language Design and Implementation},
  doi = {10.1145/2737924.2737963},
  html = {https://stefan-marr.de/papers/pldi-marr-et-al-zero-overhead-metaprogramming/},
  isbn = {978-1-4503-3468-6},
  month = jun,
  numpages = {10},
  pages = {545--554},
  url = {https://stefan-marr.de/downloads/pldi15-marr-et-al-zero-overhead-metaprogramming.pdf},
  publisher = {ACM},
  series = {PLDI'15},
  title = {Zero-Overhead Metaprogramming: Reflection and Metaobject Protocols Fast and without Compromises},
  year = 2015,
  vm:concepts = {Lookup Caches, Metaobject Protocols},
  vm:shortTitle = {Dispatch Chains (Multidimensional Lookup Caches)},
  vm:edge:uses = {Wurthinger:2012:SOAST},
  vm:edge:extends = {Hoelzle:91:PIC + nested PICs, cache for metaprogramming}
}
@inproceedings{ChevalierBoisvert:15:BBV,
  author = {Chevalier-Boisvert, Maxime and Feeley, Marc},
  booktitle = {29th European Conference on Object-Oriented Programming (ECOOP 2015)},
  doi = {10.4230/LIPIcs.ECOOP.2015.101},
  isbn = {978-3-939897-86-6},
  issn = {1868-8969},
  pages = {101--123},
  publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
  series = {ECOOP'15},
  title = {Simple and Effective Type Check Removal through Lazy Basic Block Versioning},
  volume = 37,
  year = 2015,
  url = {https://drops.dagstuhl.de/opus/volltexte/2015/5219/pdf/9.pdf},
  vm:shortTitle = {Basic Block Versioning},
  vm:edge:extends = {Compilation basic-block JIT}
}
@inproceedings{Daloze:2015:GLS,
  abstract = {Safepoints are a virtual machine mechanism that allows one thread
    to suspend other threads in a known state so that runtime actions
    can be performed without interruption and with data structures in a
    consistent state. Many virtual machines use safepoints as a mechanism
    to provide services such as stop-the-world garbage collection,
    debugging, and modification to running code such as installing
    or replacing classes. Languages implemented on these virtual machines
    may have access to these services, but not directly to the
    safepoint mechanism itself. We show that safepoints have many
    useful applications for the implementation of guest languages running
    on a virtual machine. We describe an API for using safepoints
    in languages that were implemented under the Truffle language implementation
    framework on the Java Virtual Machine and show
    several applications of the API to implement useful guest-language
    functionality. We present an efficient implementation of this API,
    when running in combination with the Graal dynamic compiler. We
    also demonstrate that our safepoints cause zero overhead with respect
    to peak performance and statistically insignificant overhead
    with respect to compilation time. We compare this to other techniques
    that could be used to implement the same functionality and
    demonstrate the large overhead that they incur.},
  author = {Daloze, Benoit and Seaton, Chris and Bonetta, Daniele and Mössenböck, Hanspeter},
  booktitle = {Proceedings of the 10th International Workshop on Implementation, Compilation, Optimization of Object-Oriented Languages, Programs and Systems},
  numpages = {10},
  series = {ICOOOLPS'15},
  title = {Techniques and Applications for Guest-Language Safepoints},
  year = 2015,
  url = {https://eregon.me/blog/assets/research/guest-language-safepoints.pdf},
  vm:concepts = {Safepoints},
  vm:shortTitle = {Guest-Language Safepoints}
}
@inproceedings{Pape:2015:LSS,
abstract = {Storage strategies have been proposed as a run-time optimization for the PyPy Python implementation and have shown promising results for optimizing execution speed and memory requirements. However, it remained unclear whether the approach works equally well in other dynamic languages. Furthermore, while PyPy is based on RPython, a language to write VMs with reusable components such as a tracing just-in-time compiler and garbage collection, the strategies design itself was not generalized to be reusable across languages implemented using that same toolchain. In this paper, we present a general design and implementation for storage strategies and show how they can be reused across different RPython-based languages. We evaluate the performance of our implementation for RSqueak, an RPython-based VM for Squeak/Smalltalk and show that storage strategies may indeed offer performance benefits for certain workloads in other dynamic programming languages.We furthermore evaluate the generality of our implementation by applying it to Topaz, a Ruby VM, and Pycket, a Racket implementation.},
author = {Pape, Tobias and Felgentreff, Tim and Hirschfeld, Robert and Gulenko, Anton and Bolz, Carl Friedrich},
booktitle = {Proceedings of the 11th Symposium on Dynamic Languages},
doi = {10.1145/2816707.2816716},
isbn = {978-1-4503-3690-1},
numpages = {10},
pages = {104--113},
publisher = {ACM},
series = {DLS'15},
title = {Language-independent Storage Strategies for tracing-{JIT}-based Virtual Machines},
year = 2015,
url = {https://www.hpi.uni-potsdam.de/hirschfeld/publications/media/PapeFelgentreffHirschfeldGulenkoBolz_2015_LanguageIndependentStorageStrategiesForTracingJitBasedVirtualMachines_AuthorsVersion.pdf},
vm:shortTitle = {Language-independent Storage Strategies},
vm:edge:extends = {Bolz:2013:SSC + language independent}
}
@inproceedings{Marr:2015:MTPE,
abstract = {Tracing and partial evaluation have been proposed as meta-compilation techniques for interpreters to make just-in-time compilation language-independent. They promise that programs executing on simple interpreters can reach performance of the same order of magnitude as if they would be executed on state-of-the-art virtual machines with highly optimizing just-in-time compilers built for a specific language. Tracing and partial evaluation approach this meta-compilation from two ends of a spectrum, resulting in different sets of tradeoffs.
This study investigates both approaches in the context of self-optimizing interpreters, a technique for building fast abstract-syntax-tree interpreters. Based on RPython for tracing and Truffle for partial evaluation, we assess the two approaches by comparing the impact of various optimizations on the performance of an interpreter for SOM, an object-oriented dynamically-typed language. The goal is to determine whether either approach yields clear performance or engineering benefits. We find that tracing and partial evaluation both reach roughly the same level of performance. SOM based on meta-tracing is on average 3x slower than Java, while SOM based on partial evaluation is on average 2.3x slower than Java. With respect to the engineering, tracing has however significant benefits, because it requires language implementers to apply fewer optimizations to reach the same level of performance.},
appendix = {http://stefan-marr.de/papers/oopsla-marr-ducasse-meta-tracing-vs-partial-evaluation-artifacts/},
author = {Marr, Stefan and Ducasse, Stéphane},
blog = {http://stefan-marr.de/2015/10/tracing-vs-partial-evaluation-comparing-meta-compilation-approaches-for-self-optimizing-interpreters/},
booktitle = {Proceedings of the 2015 ACM International Conference on Object Oriented Programming Systems Languages \& Applications},
doi = {10.1145/2814270.2814275},
html = {http://stefan-marr.de/papers/oopsla-marr-ducasse-meta-tracing-vs-partial-evaluation/},
isbn = {978-1-4503-2585-1},
month = oct,
numpages = {19},
pages = {821--839},
url = {https://stefan-marr.de/downloads/oopsla15-marr-ducasse-meta-tracing-vs-partial-evaluation.pdf},
publisher = {ACM},
series = {OOPSLA'15},
title = {Tracing vs. Partial Evaluation: Comparing Meta-Compilation Approaches for Self-Optimizing Interpreters},
year = 2015,
vm:shortTitle = {Tracing vs. Partial Evaluation},
vm:edge:uses = {
Bolz:2009:TMP + comparison;
Wurthinger:2017:PPE + comparison
}
}
@inproceedings{Chevalier-Boisvert:2016:ITS,
author = {Chevalier-Boisvert, Maxime and Feeley, Marc},
booktitle = {30th European Conference on Object-Oriented Programming (ECOOP 2016)},
description = {DROPS - Interprocedural Type Specialization of JavaScript Programs Without Type Analysis},
doi = {10.4230/LIPIcs.ECOOP.2016.7},
isbn = {978-3-95977-014-9},
issn = {1868-8969},
pages = {7:1--7:24},
publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
series = {ECOOP'16},
title = {Interprocedural Type Specialization of {JavaScript} Programs Without Type Analysis},
volume = 56,
year = 2016,
url = {https://drops.dagstuhl.de/opus/volltexte/2016/6101/pdf/LIPIcs-ECOOP-2016-7.pdf},
vm:shortTitle = {Shapes and Basic Block Versioning},
vm:edge:extends = {
Woss:14:TOM + basic block versioning;
ChevalierBoisvert:15:BBV + type propagation}
}
@inproceedings{Daloze:2016:TSO,
abstract = {We are in the multi-core era.
Dynamically-typed languages are in widespread use, but their support for
multithreading still lags behind. One of the reasons is that the sophisticated
techniques they use to efficiently represent their dynamic object models are
often unsafe in multithreaded environments.
This paper defines safety requirements for dynamic object models in
multithreaded environments.
Based on these requirements, a language-agnostic and thread-safe object model
is designed that maintains the efficiency of sequential approaches.
This is achieved by ensuring that field reads do not require synchronization
and field updates only need to synchronize on objects shared between threads.
Basing our work on JRuby+Truffle, we show that our safe object model has zero overhead on peak performance for thread-local objects
and only 3\% average overhead on parallel benchmarks where field updates require synchronization.
Thus, it can be a foundation for safe and efficient multithreaded VMs for a wide range of dynamic languages.},
author = {Daloze, Benoit and Marr, Stefan and Bonetta, Daniele and Mössenböck, Hanspeter},
booktitle = {Proceedings of the 2016 ACM International Conference on Object Oriented Programming Systems Languages \& Applications},
day = 2,
doi = {10.1145/2983990.2984001},
isbn = {978-1-4503-4444-9},
month = nov,
numpages = {18},
pages = {642--659},
url = {https://eregon.me/blog/assets/research/thread-safe-objects.pdf},
publisher = {ACM},
series = {OOPSLA'16},
title = {Efficient and Thread-Safe Objects for Dynamically-Typed Languages},
year = 2016,
vm:concepts = {VM Implementation Thread Safety},
vm:shortTitle = {Thread-Safe Object Model},
vm:edge:extends = {
Domani:02:TLAB + self-specializing write barrier;
Woss:14:TOM + thread safety},
vm:edge:uses = {
Marr:15:DC for efficient sharing}
}
@inproceedings{Marr:2016:AWFY,
abstract = {Comparing the performance of programming languages is difficult because they differ in many aspects including preferred programming abstractions, available frameworks, and their runtime systems. Nonetheless, the question about relative performance comes up repeatedly in the research community, industry, and wider audience of enthusiasts.
This paper presents 14 benchmarks and a novel methodology to assess the compiler effectiveness across language implementations. Using a set of common language abstractions, the benchmarks are implemented in Java, JavaScript, Ruby, Crystal, Newspeak, and Smalltalk. We show that the benchmarks exhibit a wide range of characteristics using language-agnostic metrics. Using four different languages on top of the same compiler, we show that the benchmarks perform similarly and therefore allow for a comparison of compiler effectiveness across languages. Based on anecdotes, we argue that these benchmarks help language implementers to identify performance bugs and optimization potential by comparing to other language implementations.},
appendix = {https://github.com/smarr/are-we-fast-yet#readme},
author = {Marr, Stefan and Daloze, Benoit and Mössenböck, Hanspeter},
blog = {http://stefan-marr.de/2016/10/cross-language-compiler-benchmarking-are-we-fast-yet/},
booktitle = {Proceedings of the 12th Symposium on Dynamic Languages},
day = 1,
doi = {10.1145/2989225.2989232},
html = {http://stefan-marr.de/papers/dls-marr-et-al-cross-language-compiler-benchmarking-are-we-fast-yet/},
isbn = {978-1-4503-4445-6},
month = nov,
numpages = {12},
pages = {120--131},
url = {https://stefan-marr.de/downloads/dls16-marr-et-al-cross-language-compiler-benchmarking-are-we-fast-yet.pdf},
publisher = {ACM},
series = {DLS'16},
title = {Cross-Language Compiler Benchmarking---Are We Fast Yet?},
year = 2016,
vm:concepts = {Benchmarking},
vm:shortTitle = {Are We Fast Yet? Benchmarks}
}
@Comment{
Dacapo and Dacapo Con Scala important influences
}
@inproceedings{Ungar:17:DA,
author = {Ungar, David and Grove, David and Franke, Hubertus},
booktitle = {Proceedings of the 13th {ACM} {SIGPLAN} International Symposium on Dynamic Languages - {DLS} 2017},
description = {Dynamic atomicity: optimizing swift memory management | Proceedings of the 13th ACM SIGPLAN International Symposium on Dynamic Languages},
doi = {10.1145/3133841.3133843},
keywords = {MemoryManagement Reachability ReferenceCounting Swift},
publisher = {{ACM} Press},
series = {DLS'17},
title = {Dynamic Atomicity: Optimizing Swift Memory Management},
url = {https://doi.org/10.1145/3133841.3133843},
year = 2017,
vm:concepts = {Garbage Collection},
vm:shortTitle = {Reachability for Dynamic Atomic Reference Counting},
vm:edge:extends = {Daloze:2016:TSO + eliminate local refcounting}
}
@inproceedings{Wurthinger:2017:PPE,
abstract = {Most high-performance dynamic language virtual machines duplicate language semantics in the interpreter, compiler, and runtime system. This violates the principle to not repeat yourself. In contrast, we define languages solely by writing an interpreter. The interpreter performs specializations, e.g., augments the interpreted program with type information and profiling information. Compiled code is derived automatically using partial evaluation while incorporating these specializations. This makes partial evaluation practical in the context of dynamic languages: It reduces the size of the compiled code while still compiling all parts of an operation that are relevant for a particular program. When a speculation fails, execution transfers back to the interpreter, the program re-specializes in the interpreter, and later partial evaluation again transforms the new state of the interpreter to compiled code. We evaluate our approach by comparing our implementations of JavaScript, Ruby, and R with best-in-class specialized production implementations. Our general-purpose compilation system is competitive with production systems even when they have been heavily optimized for the one language they support. For our set of benchmarks, our speedup relative to the V8 JavaScript VM is 0.83x, relative to JRuby is 3.8x, and relative to GNU R is 5x.},
author = {W\"{u}rthinger, Thomas and Wimmer, Christian and Humer, Christian and W\"{o}\ss, Andreas and Stadler, Lukas and Seaton, Chris and Duboscq, Gilles and Simon, Doug and Grimmer, Matthias},
booktitle = {Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation},
description = {Practical partial evaluation for high-performance dynamic language runtimes},
doi = {10.1145/3062341.3062381},
isbn = {978-1-4503-4988-8},
numpages = {15},
pages = {662--676},
publisher = {ACM},
series = {PLDI'17},
title = {Practical Partial Evaluation for High-performance Dynamic Language Runtimes},
year = 2017,
url = {https://chrisseaton.com/rubytruffle/pldi17-truffle/pldi17-truffle.pdf},
vm:shortTitle = {Partially Evaluating Interpreters},
vm:edge:extends = {
Wurthinger:2012:SOAST + compilation;
Chambers:89:Self + method-based PE},
vm:edge:uses = {Woss:14:TOM;Duboscq:2014:SWR}
}
@article{Daloze:2018:TSC,
abstract = {Dynamic programming languages such as Python and Ruby are widely used, and much effort is spent on making them efficient. One substantial research effort in this direction is the enabling of parallel code execution. While there has been significant progress, making dynamic collections efficient, scalable, and thread-safe is an open issue. Typical programs in dynamic languages use few but versatile collection types. Such collections are an important ingredient of dynamic environments, but are difficult to make safe, efficient, and scalable.
In this paper, we propose an approach for efficient and concurrent collections by gradually increasing synchronization levels according to the dynamic needs of each collection instance. Collections reachable only by a single thread have no synchronization, arrays accessed in bounds have minimal synchronization, and for the general case, we adopt the Layout Lock paradigm and extend its design with a lightweight version that fits the setting of dynamic languages. We apply our approach to Ruby’s Array and Hash collections. Our experiments show that our approach has no overhead on single-threaded benchmarks, scales linearly for Array and Hash accesses, achieves the same scalability as Fortran and Java for classic parallel algorithms, and scales better than other Ruby implementations on Ruby workloads.},
author = {Daloze, Benoit and Tal, Arie and Marr, Stefan and Mössenböck, Hanspeter and Petrank, Erez},
doi = {10.1145/3276478},
journal = {Proceedings of the ACM on Programming Languages},
month = nov,
number = {OOPSLA},
pages = {108:1--108:30},
url = {https://eregon.me/blog/assets/research/thread-safe-collections.pdf},
series = {OOPSLA'18},
title = {Parallelization of Dynamic Languages: Synchronizing Built-in Collections},
volume = 2,
year = 2018,
vm:shortTitle = {Synchronize Collections based on Reachability},
vm:edge:extends = {
Bolz:2013:SSC + thread safety;
Domani:02:TLAB + self-specializing write barrier;
Daloze:2016:TSO + collections},
vm:edge:uses = {
Daloze:2015:GLS}
}
@inproceedings{Choi:18:BRC,
  author = {Choi, Jiho and Shull, Thomas and Torrellas, Josep},
  title = {Biased Reference Counting},
  booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
  series = {PACT'18},
  publisher = {{ACM} Press},
  year = 2018,
  doi = {10.1145/3243176.3243195},
  url = {https://iacoma.cs.uiuc.edu/iacoma-papers/pact18.pdf},
  vm:concepts = {Garbage Collection},
  vm:shortTitle = {Biased Reference Counting},
  vm:edge:similar = {Ungar:17:DA similar with bias towards a thread}
}
@inproceedings{Ottoni:2018:HJP,
author = {Ottoni, Guilherme},
booktitle = {Proceedings of the 39th ACM SIGPLAN Conference on Programming Language Design and Implementation},
doi = {10.1145/3192366.3192374},
isbn = {978-1-4503-5698-5},
numpages = {15},
pages = {151--165},
publisher = {ACM},
series = {PLDI'18},
title = {{HHVM} {JIT}: A Profile-guided, Region-based Compiler for {PHP} and {Hack}},
year = 2018,
url = {https://research.fb.com/wp-content/uploads/2018/04/hhvm-jit-a-profile-guided-region-based-compiler-for-php-and-hack.pdf},
vm:shortTitle = {Region-based JIT},
vm:edge:extends = {Compilation region JIT}
}
@inproceedings{Roberts:2019:TTAF,
abstract = {Transient gradual typing imposes run-time type tests that typically cause a linear slowdown. This performance impact discourages the use of type annotations because adding types to a program makes the program slower. A virtual machine can employ standard just-in-time optimizations to reduce the overhead of transient checks to near zero. These optimizations can give gradually-typed languages performance comparable to state-of-the-art dynamic languages, so programmers can add types to their code without affecting their programs' performance.},
appendix = {https://github.com/gracelang/moth-benchmarks/blob/papers/ecoop19/index.md},
author = {Roberts, Richard and Marr, Stefan and Homer, Michael and Noble, James},
booktitle = {33rd European Conference on Object-Oriented Programming},
day = 15,
doi = {10.4230/LIPIcs.ECOOP.2019.5},
isbn = {978-3-95977-111-5},
issn = {1868-8969},
month = jul,
number = 5,
pages = {5:1--5:28},
url = {https://stefan-marr.de/downloads/ecoop19-roberts-et-al-transient-typechecks-are-almost-free.pdf},
publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
series = {ECOOP'19},
title = {Transient Typechecks are (Almost) Free},
volume = 134,
year = 2019,
vm:concepts = {Optimization, Gradual Typing},
vm:shortTitle = {Optimizing Gradual Typing on GraalVM},
vm:edge:uses = {Woss:14:TOM;Marr:15:DC}
}
@Comment{
TODO: add the important papers this one is based on
}
@inproceedings{Cheng:2020:TF,
author = {Cheng, Lin and Ilbeyi, Berkin and Bolz-Tereick, Carl Friedrich and Batten, Christopher},
booktitle = {Proceedings of the 18th {ACM}/{IEEE} International Symposium on Code Generation and Optimization},
description = {Type freezing: exploiting attribute type monomorphism in tracing JIT compilers | Proceedings of the 18th ACM/IEEE International Symposium on Code Generation and Optimization},
doi = {10.1145/3368826.3377907},
month = feb,
publisher = {ACM},
series = {CGO'20},
title = {Type Freezing: Exploiting Attribute Type Monomorphism in Tracing {JIT} Compilers},
url = {https://dl.acm.org/doi/abs/10.1145/3368826.3377907},
year = 2020,
vm:shortTitle = {Type Freezing Maps},
vm:edge:extends = {Woss:14:TOM + nested type freezing}
}
@concept{Execution,
  vm:shortTitle = {Execution Techniques}
}
@concept{Interpretation,
  vm:concepts = {Execution}
}
@concept{Compilation,
  vm:concepts = {Execution}
}