@inproceedings{venkataramani-dac07,
author = {Girish Venkataramani and Tiberiu Chelcea and Mihai Budiu and Seth C. Goldstein},
title = {Critical Path: A Tool for System-Level Timing Analysis},
booktitle = {Design Automation Conference (DAC)},
address = {San Diego, CA},
month = {June 4--8},
year = {2007},
abstract = {An effective method for focusing optimization effort on the most
important parts of a design is to examine those elements on the
critical path. Traditionally, the critical path is defined at the RTL
level, as the longest path in the combinational logic between clocked
registers. In this paper, we present a system-level timing analysis
technique to define the concept of a Global Critical Path (GCP), for
predicting system-level performance. We show how the GCP can be used as
a theoretical and practical tool for understanding, summarizing and
optimizing the behavior of highly concurrent self-timed circuits. We
formally define the GCP and show how it can be constructed using a
discrete event model and hardware profiling techniques. The GCP
provides valuable insight into the control-path behavior of circuits
and in finding system-level bottlenecks. We have incorporated the GCP
construction and analysis framework into a high-level synthesis and
simulation toolchain, thus enabling complete automation in modeling,
analysis and optimization.},
note = {An expanded version is in CMU-CS-06-144},
url = {http://research.microsoft.com/users/mbudiu/dac07.pdf},
acceptancerate = {161/713 = 22\%},
confweb = {http://www.dac.com/44th/index.html}
}
@inproceedings{isard-eurosys07,
author = {Michael Isard and Mihai Budiu and Yuan Yu and Andrew Birrell and Dennis Fetterly},
title = {{Dryad}: Distributed Data-Parallel Programs from Sequential Building Blocks},
booktitle = {European Conference on Computer Systems (EuroSys)},
pages = {59--72},
address = {Lisbon, Portugal},
month = {March 21-23},
year = {2007},
abstract = {Dryad is a general-purpose distributed execution engine for
coarse-grain data-parallel applications. A Dryad application combines
computational ``vertices'' with communication ``channels'' to form a
dataflow graph. Dryad runs the application by executing the vertices of
this graph on a set of available computers, communicating as
appropriate through files, TCP pipes, and shared-memory FIFOs. The
vertices provided by the application developer are quite simple and are
usually written as sequential programs with no thread creation or
locking. Concurrency arises from Dryad scheduling vertices to run
simultaneously on multiple computers, or on multiple CPU cores within a
computer. The application can discover the size and placement of data
at run time, and modify the graph as the computation progresses to make
efficient use of the available resources. Dryad is designed to scale
from powerful multi-core single computers, through small clusters of
computers, to data centers with thousands of computers. The Dryad
execution engine handles all the difficult problems of creating a large
distributed, concurrent application: scheduling the use of computers
and their CPUs, recovering from communication or computer failures, and
transporting data between vertices.},
note = {also as MSR-TR-2006-140},
url = {http://research.microsoft.com/users/mbudiu/eurosys07.pdf},
confweb = {http://www.eurosys.org/2007}
}
@InProceedings{budiu-asid06,
author = {Mihai Budiu and {\'U}lfar Erlingsson and Mart{\'\i}n Abadi},
title = {Architectural Support for Software-Based Protection},
booktitle = {Workshop on Architectural and System Support for Improving Software
Dependability (ASID)},
pages = {42--51},
address = {San Jose, CA},
month = {October 21},
year = {2006},
abstract = {Control-Flow Integrity (CFI) is a property that guarantees program
control flow cannot be subverted by a malicious adversary, even if the
adversary has complete control of data memory. We have shown in prior
work how CFI can be enforced by using inlined software guards that
perform safety checks. The first part of this paper shows how modest
Instruction Set Architecture (ISA) support can replace such guard code
with single instructions. \par On the foundation of CFI we have
implemented XFI: a protection system that offers fine-grained memory
access control and fundamental integrity guarantees for critical system
state. XFI can be seen as a flexible, generalized form of
software-based fault isolation (SFI). In the second part of this paper
we present ISA support for XFI, in the form of simple bounds-check
instructions. \par CFI and XFI can significantly increase the security
and integrity of software execution. Our results indicate that support
for CFI and XFI is a straightforward, simple addition to hardware
architectures. Compared to software guards, such hardware support
increases the efficiency and simplicity of enforcement.},
note = {Also as MSR-TR-2006-115},
url = {http://research.microsoft.com/users/mbudiu/asid06.pdf},
confweb = {http://opera.cs.uiuc.edu/ASID}
}
@TechReport{venkataramani-tr06,
author = {Girish Venkataramani and Tiberiu Chelcea and Mihai Budiu and Seth C. Goldstein},
title = {Modeling the Global Critical Path in Concurrent Systems},
institution = {Carnegie Mellon University, Computer Science Department},
number = {CMU-CS-06-144},
pages = {22},
month = {August},
year = {2006},
abstract = {We show how the global critical path can be used as a practical tool
for understanding, optimizing and summarizing the behavior of highly
concurrent self-timed circuits. Traditionally, critical path analysis
has been applied to DAGs, and thus is constrained to combinatorial
sub-circuits. We formally define the global critical path (GCP) and
show how it can be constructed using only local information that is
automatically derived directly from the circuit. We introduce a form of
Production Rules, which can accurately determine the GCP for a given
input vector even for modules which exhibit choice. \par The GCP
provides valuable insight into the behavior of circuits and, more
generally, into the control behavior of the application class. These
insights help in formulating new optimizations and re-formulating
existing ones to use the GCP knowledge. We have incorporated our
framework into a high-level synthesis tool-chain, which automatically
computes GCP, and utilizes the GCP for large scale circuits. We
demonstrate the effectiveness of the GCP framework by re-formulating
two traditional CAD optimizations to use the GCPyielding efficient
algorithms which improve circuit power (by up to 9\%) and performance
(by up to 60\%) in our experiments.},
url = {http://reports-archive.adm.cs.cmu.edu/anon/2006/abstracts/06-144.html}
}
@InProceedings{erlingsson-osdi06,
author = {{\'{U}}lfar Erlingsson and Mart{\'\i}n Abadi and Michael Vrable and Mihai Budiu and George C. Necula},
title = {{XFI}: Software Guards for System Address Spaces},
booktitle = {Symposium on Operating System Design and Implementation (OSDI)},
pages = {75--88},
address = {Seattle, WA},
month = {November 6-8},
year = {2006},
abstract = {XFI is a comprehensive protection system that offers both flexible
access control and fundamental integrity guarantees, at any privilege
level and even for legacy code in commodity systems. For this purpose,
XFI combines static analysis with inline software guards and a
two-stack execution model. We have implemented XFI forWindows on the
x86 architecture using binary rewriting and a simple, stand-alone
verifier; the implementation¢s correctness depends on the verifier, but
not on the rewriter. We have applied XFI to software such as device
drivers and multimedia codecs. The resulting modules function safely
within both kernel and user-mode address spaces, with only modest
enforcement overheads},
url = {http://research.microsoft.com/users/mbudiu/osdi06.pdf},
confweb = {http://www.usenix.org/events/osdi06},
acceptancerate = {27/150=18\%}
}
@InProceedings{mishra-asplos06,
author = {Mahim Mishra and Timothy J. Callahan and Tiberiu Chelcea and Girish Venkataramani and Mihai Budiu and Seth C. Goldstein},
title = {{Tartan}: Evaluating Spatial Computation For Whole Program Execution},
booktitle = {International Conference on Architectural Support for Programming
Languages and Operating Systems (ASPLOS)},
pages = {163--174},
address = {San Jose, CA},
month = {October 21-25},
year = {2006},
abstract = {Spatial Computing (SC) has been shown to be an energy-efficient model
for implementing program kernels. In this paper we explore the
feasibility of using SC for more than small kernels. To this end, we
evaluate the performance and energy efficiency of entire applications
on Tartan, a general-purpose architecture which integrates a
reconfigurable fabric (RF) with a superscalar core. Our compiler
automatically partitions and compiles an application into an
instruction stream for the core and a configuration for the RF. We use
a detailed simulator to capture both timing and energy numbers for all
parts of the system. Our results indicate that a hierarchical RF
architecture, designed around a scalable interconnect, is instrumental
in harnessing the benefits of spatial computation. The interconnect
uses static configuration and routing at the lower levels and a
packet-switched, dynamically-routed network at the top level. Tartan is
most energyefficient when almost all of the application is mapped to
the RF, indicating the need for the RF to support most general-purpose
programming constructs. Our initial investigation reveals that such a
system can provide, on average, an order of magnitude improvement in
energy-delay compared to an aggressive superscalar core on
single-threaded workloads.},
url = {http://research.microsoft.com/users/mbudiu/asplos06.pdf},
confweb = {http://www.princeton.edu/~asplos06/},
acceptancerate = {38/158=24\%}
}
@InProceedings{budiu-ispass05,
author = {Mihai Budiu and Pedro V. Artigas and Seth Copen Goldstein},
title = {Dataflow: A Complement to Superscalar},
booktitle = {IEEE International Symposium on Performance Analysis of Systems and
Software (ISPASS)},
pages = {177--186},
address = {Austin, TX},
month = {March 20-22},
year = {2005},
abstract = {There has been a resurgence of interest in dataflow architectures,
because of their potential for exploiting parallelism with low
overhead. In this paper we analyze the performance of a class of static
dataflow machines on integer media and control-intensive programs and
we explain why a dataflow machine, even with unlimited resources, does
not always outperform a superscalar processor on general-purpose codes,
under the assumption that both machines take the same time to execute
basic operations. We compare a program-specific dataflow machine with
unlimited parallelism to a superscalar processor running the same
program. While the dataflow machines provide very good performance on
most data-parallel programs, we show that the dataflow machine cannot
always take advantage of the available parallelism. Using the dynamic
critical path we investigate the mechanisms used by superscalar
processors to provide a performance advantage and their impact on a
dataflow model.},
url = {http://www.cs.cmu.edu/~mihaib/research/ispass05.pdf},
confweb = {http://www.ispass.org/ispass2005},
acceptancerate = {27/92 = 29\%}
}
@InProceedings{budiu-odes05,
author = {Mihai Budiu and Seth Copen Goldstein},
title = {Inter-Iteration Scalar Replacement in the Presence of Conditional Control-Flow},
booktitle = {Workshop on Optimizations for {DSP} and Embedded Systems (ODES)},
pages = {20--29},
address = {San Jose, CA},
month = {March 20},
year = {2005},
abstract = {A large class of multimedia programs for embedded systems manipulate
data represented as dense matrices. In this paper we revisit the
classical optimization of scalar replacement of array elements and
pointer accesses; this optimization allocates array elements to
registers, reducing memory traffic. We generalize the state-of-the-art
algorithm, by Carr and Kennedy~\cite{carr-spe94}, improving it to
handle simultaneously both conditional control-flow and inter-iteration
data reuse. Our algorithm operates within the same assumptions of the
classical one (perfect dependence information), and has the same
limitations (increased register pressure). It is, however, optimal in
the sense that within each code region where scalar promotion is
applied, given sufficient registers, each memory location is
read/written at most once.},
url = {http://www.cs.cmu.edu/~mihaib/research/odes05.pdf},
confweb = {http://www.ece.vill.edu/~deepu/odes/odes-3_program.html}
}
@inproceedings{abadi-icfem05,
author = {Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti},
title = {A Theory of Secure Control-Flow},
booktitle = {International Conference on Formal Engineering Methods (ICFEM)},
pages = {111-124},
address = {Manchester, UK},
month = {November 1-4},
year = {2005},
abstract = {Control-Flow Integrity (CFI) means that the execution of a program
dynamically follows only certain paths, in accordance with a static
policy. CFI can prevent attacks that, by exploiting buffer overflows
and other vulnerabilities, attempt to control program behavior. This
paper develops the basic theory that underlies two practical techniques
for CFI enforcement, with precise formulations of hypotheses and
guarantees.},
url = {http://research.microsoft.com/users/mbudiu/icfem05.pdf},
confweb = {http://www.cs.man.ac.uk/icfem05}
}
@inproceedings{abadi-ccs05,
author = {Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti},
title = {Control-Flow Integrity},
booktitle = {ACM Conference on Computer and Communication Security (CCS)},
pages = {340-353},
address = {Alexandria, VA},
month = {November 7-11},
year = {2005},
abstract = {Current software attacks often build on exploits that subvert
machine-code execution. The enforcement of a basic safety property,
Control-Flow Integrity (CFI), can prevent such attacks from arbitrarily
controlling program behavior. CFI enforcement is simple, and its
guarantees can be established formally, even with respect to powerful
adversaries. Moreover, CFI enforcement is practical: it is compatible
with existing software and can be efficiently implemented using
software rewriting in commodity systems. Finally, CFI provides a useful
foundation for enforcing further security policies, such as policies
that constrain the use of data memory.},
url = {http://research.microsoft.com/users/mbudiu/ccs05.pdf},
confweb = {http://www.acm.org/sigs/sigsac/ccs/CCS2005},
acceptancerate = {38/250=25\%}
}
@InProceedings{budiu-asplos04,
author = {Mihai Budiu and Girish Venkataramani and Tiberiu Chelcea and Seth Copen Goldstein},
title = {Spatial Computation},
booktitle = {International Conference on Architectural Support for Programming
Languages and Operating Systems (ASPLOS)},
pages = {14--26},
address = {Boston, MA},
month = {October 9-13},
year = {2004},
abstract = {This paper describes a computer architecture that relies on the direct
translation of high-level language programs into {\em Spatial
Computation} (SC) hardware structures. SC program implementations are
completely distributed, without any centralized control. SC circuits
are optimized for {\em wires} at the expense of computation units. \par
In this paper we investigate a particular implementation SC structures
called ASH (Application-Specific Hardware). Under the assumption that
computation is cheaper than communication, ASH replicates computation
units to simplify interconnect, building a system which uses very
simple, completely dedicated communication channels. As a consequence,
communication on the datapath never requires arbitration; the only
arbitration required is for accessing memory. ASH relies on very simple
hardware primitives, using no associative structures, no multiported
register files, no scheduling logic, no broadcast, and no clocks. As a
consequence, ASH hardware is fast and extremely power efficient. \par
In this work we demonstrate three features of ASH: (1) that such
architectures can be built by automatic compilation of C programs, (2)
that distributed computation is in some respects fundamentally
different from monolithic superscalar processors and (3) that ASIC
implementations of ASH use 3 orders of magnitude less energy compared
to high-end superscalar processors, while being within a factor of two
in performance.},
url = {http://www.cs.cmu.edu/~mihaib/research/asplos04.pdf},
doi = {http://doi.acm.org/10.1145/1024393.1024396},
confweb = {http://www.eecg.toronto.edu/asplos2004},
acceptancerate = {24/169 = 14\%}
}
@InProceedings{koes-msp04,
author = {David Koes and Mihai Budiu and Girish Venkataramani and Seth Copen Goldstein},
title = {Programmer Specified Pointer Independence},
booktitle = {Workshop on Memory System Performance (MSP)},
month = {June},
year = {2004},
abstract = {Good alias analysis is essential in order to achieve high performance
on modern processors, yet interprocedural analysis does not scale well.
We present a source code annotation, {\tt \#pragma independent}, which
is a more flexible, intuitive and useful way for the programmer to
provide pointer aliasing information than the current C99 {\tt
restrict} keyword. We describe a tool which highlights the most
important and most likely correct locations at which a programmer can
insert the pragmas. We show that such annotations can be used
effectively in compilers to achieve speedups of up to 1.2x.},
note = {Also as TR CMU-CS-03-123},
url = {http://www.cs.cmu.edu/~mihaib/research/msp04.pdf},
confweb = {http://cs.anu.edu.au/~Steve.Blackburn/msp2004}
}
@InProceedings{venkataramani-iwls04,
author = {Girish Venkataramani and Mihai Budiu and Seth Copen Goldstein},
title = {{C} to Asynchronous Dataflow Circuits: An End-to-End Toolflow},
booktitle = {International Workshop on Logic synthesis (IWLS)},
pages = {501--508},
address = {Temecula, CA},
month = {June},
year = {2004},
abstract = {We present a complete toolflow that translates ANSI-C programs into
asynchronous circuits. The toolflow is built around a compiler that
converts C into a functional dataflow intermediate representation,
exposing instruction-level, pipeline and memory parallelism. The
compiler performs optimizations and converts the intermediate
representation into pipelined asynchronous circuits, with no
centralized controllers. In the resulting circuits, control is
distributed, communication is achieved through local wires, and
arbitration for datapath resources is unnecessary. Circuits
automatically synthesized from Mediabench kernels exhibit substantially
better energy-delay than either single-issue processors or aggressive
superscalar cores.},
note = {(full paper)},
url = {http://www.cs.cmu.edu/~mihaib/research/iwls04.pdf},
confweb = {http://www.iwls.org/}
}
@InProceedings{budiu-cgo03,
author = {Mihai Budiu and Seth Copen Goldstein},
title = {Optimizing Memory Accesses For Spatial Computation},
booktitle = {International ACM/IEEE Symposium on Code Generation and Optimization
(CGO)},
pages = {216-227},
address = {San Francisco, CA},
month = {March 23--26},
year = {2003},
abstract = {In this paper we present the internal representation and optimizations
used by the CASH compiler for improving the memory parallelism of
pointer-based programs. CASH uses an SSA-based representation for
memory, which compactly summarizes both control-flow and dependence
information. In CASH, memory optimization is a four-step process: (1)
first an initial, relatively coarse, representation of memory
dependences is built; (2) next, unnecessary memory dependences are
removed using dependence tests; (3) third, redundant memory operations
are removed (4) finally, parallelism is increased by pipelining memory
accesses in loops. While the first three steps above are very general,
the loop pipelining transformations are particularly applicable for
spatial computation, which is the primary target of CASH. The redundant
memory removal optimizations presented are: load/store hoisting
(subsuming partial redundancy elimination and common-subexpression
elimination), load-after-store removal, store-before-store removal
(dead store removal) and loop-invariant load motion. One of our loop
pipelining transformations is a new form of loop parallelization,
called loop decoupling. This transformation separates independent
memory accesses within a loop body into several independent loops,
which are allowed dynamically to slip with respect to each other. A new
computational primitive, a token generator is used to dynamically
control the amount of slip, allowing maximum freedom, while
guaranteeing that no memory dependences are violated.},
url = {http://www-2.cs.cmu.edu/~mihaib/research/cgo03.pdf},
url2 = {http://csdl.computer.org/comp/proceedings/cgo/2003/1913/00/19130216abs.htm},
confweb = {http://www.cgo.org/cgo2003}
}
@InProceedings{goldstein-asap03,
author = {Seth Goldstein and Mihai Budiu and Mahim Mishra and Girish Venkataramani},
title = {Reconfigurable Computing and Electronic Nanotechnology},
booktitle = {IEEE International Conference on Application-specific Systems,
Architectures and Processors},
pages = {132--143},
address = {Hague, the Netherlands},
month = {June 24-26},
year = {2003},
abstract = {In this paper we examine the opportunities brought about by recent
progress in electronic nanotechnology and describe the methods needed
to harness them for building a new computer architecture. In this
process we decompose some traditional abstractions, such as the
transistor, into fine-grain pieces, such as signal restoration and
input-output isolation. We also show how we can forgo the extreme
reliability of CMOS circuits for low-cost chemical self-assembly at the
expense of large manufacturing defect densities. We discuss advanced
testing methods which can be used to recover perfect functionality from
unreliable parts. We proceed to show how the molecular switch, the
regularity of the circuits created by self-assembly and the high defect
densities logically require the use of reconfigurable hardware as a
basic building block for hardware design. We then capitalize on the
convergence of compilation and hardware synthesis (which takes place
when programming reconfigurable hardware) to propose the complete
elimination of the instruction-set architecture from the system
architecture, and the synthesis of asynchronous dataflow machines
directly from high-level programming languages, such as C. We discuss
in some detail a scalable compilation system that perform this task.},
note = {Invited paper},
url = {http://www.cs.cmu.edu/~mihaib/research/asap03.pdf},
url2 = {http://csdl.computer.org/comp/proceedings/asap/2003/1992/00/19920132abs.htm},
confweb = {http://www-ece.rice.edu/asap2003/program03.html}
}
@PhdThesis{budiu-phd03,
key = {PhD Thesis 03},
author = {Mihai Budiu},
title = {Spatial Computation},
school = {Carnegie Mellon University, Computer Science Department},
number = {CMU-CS-03-217},
pages = {225},
month = {December},
year = {2003},
abstract = {This thesis presents a compilation framework for translating ANSI C
programs into hardware dataflow machines. The framework is embodied in
the CASH compiler, a Compiler for Application-Specific Hardware. CASH
generates asynchronous hardware circuits that directly implement the
functionality of the source program, without using any interpretative
structures. This style of computation is dubbed ``Spatial
Computation''. CASH relies extensively on predication and speculation
for building efficient hardware circuits. \par The first part of this
document describes Pegasus, the internal representation of CASH, and a
series of novel program transformations performed by CASH, the most
notable of which are a new optimal register-promotion algorithm and
partial redundancy elimination for memory accesses based on predicate
manipulation. \par The second part of this document evaluates the
performance of the generated circuits using simulation. Using media
processing benchmarks, we show that for the domain of embedded
computation, the circuits generated by CASH can sustain high levels of
instruction level parallelism, due to the effective use of dataflow
software pipelining. A comparison of Spatial Computation and
superscalar processors highlights some of the weaknesses of our model
of computation, such as the lack of branch prediction and register
renaming. \par The results presented in this document can be applied in
several domains: (1) most of the compiler optimizations are applicable
to traditional compilers for high-level languages (2) CASH itself can
be used as a hardware synthesis tool for very fast system-on-a-chip
prototyping directly from C sources, (3) the compilation framework we
describe can be applied to the translation of imperative languages to
dataflow machines, (4) we have extended the dataflow machine model to
encompass predication, data-speculation and control-speculation, and
(5) the tool-chain described and some specific optimizations, such as
lenient execution, and pipeline balancing, can be used for synthesis
and optimization of asynchronous hardware.},
note = {Technical report CMU-CS-03-217},
url = {http://www.cs.cmu.edu/~mihaib/research/thesis.pdf},
url2 = {http://reports-archive.adm.cs.cmu.edu/anon/2003/abstracts/03-217.html}
}
@InBook{goldstein-chapter03,
key = {Chapter 03},
author = {Seth Copen Goldstein and Mihai Budiu},
editor = {Mark A. Reed and Takhee Lee},
title = {Molecules, Gates, Circuits, Computers},
chapter = {in Molecular Nanoelectronics},
pages = {327--388},
publisher = {American Scientific Publishers},
month = {January},
year = {2003},
url = {http://aspbs.com/molecularnano.html}
}
@InProceedings{budiu-fpl02,
author = {Mihai Budiu and Seth Copen Goldstein},
title = {Compiling Application-Specific Hardware},
booktitle = {International Conference on Field Programmable Logic and Applications
(FPL)},
pages = {853--863},
address = {Montpellier (La Grande-Motte), France},
month = {September 2--4},
year = {2002},
abstract = {In this paper we describe ASH, an architectural framework for
implementing Application-Specific Hardware. ASH is based on automatic
hardware synthesis from high-level languages. The generated circuits
use only localized computation structures; in consequence, we expect
these circuits to be fast, to use little power and to scale well with
program complexity. \par We present in detail CASH, a scalable compiler
framework for ASH, which generates hardware from programs written in C.
Our compiler exploits instruction level parallelism by using aggressive
speculation and dynamic scheduling. Based on this compilation scheme,
we evaluate the computational resources necessary for implementing
complex integer-based programs, and we suggest architectural features
that would support the ASH framework.},
url = {http://www.cs.cmu.edu/~mihaib/research/fpl02-budiu.pdf},
url2 = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=853},
confweb = {http://www.lirmm.fr/fpl2002/home.html}
}
@InProceedings{venkataramani-fpl02,
key = {FPL 02a},
author = {Girish Venkataramani and Suraj Sudhir and Mihai Budiu and Seth Copen Goldstein},
title = {Factors Influencing the Performance of a CPU-RFU Hybrid Architecture},
booktitle = {International Conference on Field Programmable Logic and Applications
(FPL)},
pages = {955--965},
address = {Montpellier (La Grande-Motte), France},
month = {September},
year = {2002},
abstract = {Closely coupling a reconfigurable fabric with a conventional processor
has been shown to successfully improve the system performance. However,
today s superscalar pro-cessors are both complex and adept at
extracting Instruction Level Parallelism (ILP), which introduces many
complex issues to the design of a hybrid CPU-RFU system. This paper
examines the design of a superscalar processor augmented with a
closely-coupled recon-figurable fabric. It identifies architectural and
compiler issues that affect the performance of the overall system.
Previous efforts at combining a processor core with a reconfigurable
fabric are examined in the light of these issues. We also present
simulation results that emphasize the impact of these factors.},
url = {http://www.cs.cmu.edu/~mihaib/research/fpl02-girish.pdf},
url2 = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=955},
confweb = {http://www.lirmm.fr/fpl2002/home.html}
}
@InProceedings{budiu-fccm02,
author = {Mihai Budiu and Mahim Mishra and Ashwin Bharambe and Seth Copen Goldstein},
title = {Peer-to-peer Hardware-Software Interfaces for Reconfigurable Fabrics},
booktitle = {IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM)},
pages = {57--66},
address = {Napa Valley, CA},
month = {April},
year = {2002},
abstract = {In this paper we describe a peer-to-peer interface between processor
cores and reconfigurable fabrics. The main advantage of the
peer-to-peer model is that it greatly expands the scope of application
for reconfigurable computing and hence its potential benefits. The
primary extension in our model is that ``code'' on the reconfigurable
hardware unit is allowed to invoke routines both on the reconfigurable
unit itself and on the fixed logic processor. We describe the software
constructs and compilation mechanisms needed for such an architecture,
including a detailed description of the interface between the two parts
of the application.},
url = {http://www.cs.cmu.edu/~mihaib/research/fccm02-peer.pdf},
url2 = {http://csdl.computer.org/comp/proceedings/fccm/2002/1801/00/18010057abs.htm},
confweb = {http://www.fccm.org/preprog02.html}
}
@InProceedings{goldstein-isca01,
author = {Seth Copen Goldstein and Mihai Budiu},
title = {{NanoFabrics}: Spatial Computing Using Molecular Electronics},
booktitle = {International Symposium on Computer Architecture (ISCA)},
pages = {178--189},
address = {G\"{o}teborg, Sweden},
year = {2001},
abstract = {The continuation of the remarkable exponential increases in processing
power over the recent past faces imminent challenges due in part to the
physics of deep-submicron CMOS devices and the costs of both chip masks
and future fabrication plants. A promising solution to these problems
is offered by an alternative to CMOS-based computing, chemically
assembled electronic nanotechnology (CAEN). In this paper we outline
how CAEN based computing can become a reality. We briefly describe
recent work in CAEN and how CAEN will affect computer architecture. We
show how the inherently reconfigurable natures of CAEN devices can be
exploited to provide high-density chips with defect tolerance which
will significantly reduce the cost of manufacturing. After developing
the basic building blocks of a CAEN based computing devices we present
some preliminary results which indicate that CAEN based computing
devices can meet or exceed the performance of CMOS based devices.},
url = {http://www.cs.cmu.edu/~seth/papers/isca01.pdf},
confweb = {http://www.ce.chalmers.se/conf2001},
doi = {http://doi.acm.org/10.1145/379240.379262}
}
@InProceedings{budiu-europar00,
key = {EuroPar 00},
author = {Mihai Budiu and Majd Sakr and Kip Walker and Seth Copen Goldstein},
title = {{BitValue} Inference: Detecting and Exploiting Narrow Bitwidth Computations},
booktitle = {European Conference on Parallel Processing (EUROPAR)},
series = {Lecture Notes in Computer Science},
volume = {1900},
pages = {969--979},
publisher = {Springer Verlag},
address = {M\"{u}nich, Germany},
year = {2000},
abstract = {We present a compiler algorithm called BitValue, which can discover
both unused and constant bits in dusty-deck C programs. BitValue uses
forward and backward dataflow analyses, generalizing constant-folding
and dead-code detection at the bit-level. This algorithm enables
compiler optimizations which target special processor architectures for
computing on non-standard bitwidths. Using this algorithm we show that
up to 31\% of the computed bytes are thrown away (for programs from
SpecINT95 and Mediabench). A compiler for reconfigurable hardware uses
this algorithm to achieve substantial reductions (up to 20-fold) in the
size of the synthesized circuits.},
note = {An expanded version is in budiu-tr00},
url = {http://www.cs.cmu.edu/~mihaib/research/europar00.pdf},
url2 = {http://link.springer.de/link/service/series/0558/papers/1900/19000969.pdf},
confweb = {http://www.informatik.uni-trier.de/GI/FG-014/Announce/2000/EuroPar2000.CFP.html}
}
@Article{goldstein-ieee00,
key = {Computer 00},
author = {Seth Copen Goldstein and Herman Schmit and Mihai Budiu and Srihari Cadambi and Matt Moe and Reed Taylor},
title = {{PipeRench}: A Reconfigurable Architecture and Compiler},
journal = {IEEE Computer},
volume = {33},
number = {4},
pages = {70--77},
month = {April},
year = {2000},
abstract = {With the proliferation of highly specialized embedded computer systems
has come a diversification of workloads for computing devices.
General-purpose processors are struggling to efficiently meet these
applications' disparate needs, and custom hardware is rarely feasible.
According to the authors, reconfigurable computing, which combines the
flexibility of general-purpose processors with the efficiency of custom
hardware, can provide the alternative. PipeRench and its associated
compiler comprise the authors' new architecture for reconfigurable
computing. Combined with a traditional digital signal processor,
microcontroller or general-purpose processor, PipeRench can support a
system's various computing needs without requiring custom hardware. The
authors describe the PipeRench architecture and how it solves some of
the pre-existing problems with FPGA architectures, such as logic
granularity, configuration time, forward compatibility, hard
constraints and compilation time.},
url = {http://www.cs.cmu.edu/~mihaib/research/computer.pdf},
url2 = {http://ieeexplore.ieee.org/iel5/2/18132/00839324.pdf}
}
@Article{birman-tocs99,
author = {Kenneth P. Birman and Mark Hayden and Oznur Oskasap and Zhen Xiao and Mihai Budiu and Yaron Minsky},
title = {Bimodal Multicast},
journal = {Transactions on Computer Systems (TOCS)},
volume = {17},
number = {2},
pages = {41--88},
month = may,
year = {1999},
abstract = {There are many methods for making a multicast protocol ``reliable''.
At one end of thespectrum, a reliable multicast protocol might offer
atomicity guarantees, such as all-ornothing delivery, delivery
ordering, and perhaps additional properties such as
virtuallysynchronous addressing. At the other are protocols that use
local repair to overcome transient packet loss in the network, offering
``best effort'' reliability. Yet none of this priorwork has treated
stability of multicast delivery as a basic reliability property, such
as might be needed in an internet radio, TV, or conferencing
application. This paper looks atreliability with a new goal:
development of a multicast protocol which is reliable in a sense that
can be rigorously quantified and includes throughput stability
guarantees. We characterize this new protocol as a ``bimodal
multicast'' in reference to its reliability model, which corresponds to
a family of bimodal probability distributions. Here, we introduce
theprotocol, provide a theoretical analysis of its behavior, review
experimental results, and discuss some candidate applications. These
confirm that bimodal multicast is reliable,scalable, and that the
protocol provides remarkably stable delivery throughput.},
url = {http://www.cs.cmu.edu/~mihaib/research/pbcast.pdf},
doi = {http://doi.acm.org/10.1145/312203.312207},
url2 = {http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p41-birman}
}
@InProceedings{goldstein-isca99,
author = {Seth Copen Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R. Reed Taylor and Ronald Laufer},
title = {{PipeRench}: a Coprocessor for Streaming Multimedia Acceleration},
booktitle = {International Symposium on Computer Architecture (ISCA)},
pages = {28--39},
address = {Atlanta, GA},
year = {1999},
abstract = {Future computing workloads will emphasize an architecture's ability to
perform relatively simple calculations on massive quantities of
mixed-width data. This paper describes a novel reconfigurable fabric
architecture, PipeRench, optimized to accelerate these types of
computations. PipeRench enables fast, robust compilers, supports
forward compatibility, and virtualizes configurations, thus removing
the fixed size constraint present in other fabrics. For the first time
we explore how the bit-width of processing elements affects performance
and show how the PipeRench architecture has been optimized to balance
the needs of the compiler against the realities of silicon. Finally, we
demonstrate extreme performance speedup on certain computing kernels
(up to 190x versus a modern RISC processor), and analyze how this
acceleration translates to application speedup.},
url = {http://www.cs.cmu.edu/~mihaib/research/isca99.pdf},
confweb = {http://www.informatik.uni-trier.de/~ley/db/conf/isca/isca99.html},
doi = {http://doi.acm.org/10.1145/300979.300982}
}
@InProceedings{budiu-fpga99,
author = {Mihai Budiu and Seth Copen Goldstein},
title = {Fast Compilation for Pipelined Reconfigurable Fabrics},
booktitle = {ACM/SIGDA International Symposium on Field Programmable Gate Arrays
(FPGA)},
pages = {195--205},
address = {Monterey, CA},
year = {1999},
abstract = {In this paper we describe a compiler which quickly synthesizes high
quality pipelined datapaths for pipelined reconfigurable devices. The
compiler uses the same internal representation to perform synthesis,
module generation, optimization, and place and route. The core of the
compiler is a linear time place and route algorithm more than two
orders of magnitude faster than traditional CAD tools. The key behind
our approach is that we never backtrack, rip-up, or re-route. Instead,
the graph representing the computation is preprocessed to guarantee
routability by inserting lazy noops. The preprocessing steps provides
enough information to make a greedy strategy feasible. The compilation
speed is approximately 3000 bit-operations/second (on a PII/400Mhz) for
a wide range of applications. The hardware utilization averages 60\% on
the target device, PipeRench.},
url = {http://www.cs.cmu.edu/~mihaib/research/fpga99.pdf},
confweb = {http://portal.acm.org/toc.cfm?id=296399&dl=portal&dl=ACM&type=proceeding&idx=SERIES100&part=Proceedings&WantType=Proceedings&title=International%20Symposium%20on%20Field%20Programmable%20Gate%20Arrays},
doi = {http://doi.acm.org/10.1145/296399.296459}
}