diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..0233495f --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-07-22T00:00:00Z":{"Hardware Architecture":[{"id":"http://arxiv.org/abs/2407.12736v2","updated":"2024-07-22T05:19:15Z","published":"2024-07-17T16:56:06Z","title":"CHOSEN: Compilation to Hardware Optimization Stack for Efficient Vision\n Transformer Inference","summary":" Vision Transformers (ViTs) represent a groundbreaking shift in machine\nlearning approaches to computer vision. Unlike traditional approaches, ViTs\nemploy the self-attention mechanism, which has been widely used in natural\nlanguage processing, to analyze image patches. Despite their advantages in\nmodeling visual tasks, deploying ViTs on hardware platforms, notably\nField-Programmable Gate Arrays (FPGAs), introduces considerable challenges.\nThese challenges stem primarily from the non-linear calculations and high\ncomputational and memory demands of ViTs. This paper introduces CHOSEN, a\nsoftware-hardware co-design framework to address these challenges and offer an\nautomated framework for ViT deployment on FPGAs in order to maximize\nperformance. Our framework is built upon three fundamental contributions: a\nmulti-kernel design to maximize the bandwidth, mainly targeting the benefits of\nmultiple DDR memory banks; approximate non-linear functions that exhibit minimal\naccuracy degradation while making efficient use of available logic blocks on the\nFPGA; and an efficient compiler to maximize the performance and memory-efficiency\nof the computing kernels by presenting a novel algorithm for design space\nexploration to find an optimal hardware configuration that achieves optimal\nthroughput and latency. Compared to state-of-the-art ViT accelerators, CHOSEN\nachieves 1.5x and 1.42x improvements in throughput on the DeiT-S and DeiT-B\nmodels.\n","authors":["Mohammad Erfan Sadeghi","Arash Fayyazi","Suhas Somashekar","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2407.12736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08192v2","updated":"2024-07-22T05:26:19Z","published":"2024-07-11T05:22:04Z","title":"ARCO: Adaptive Multi-Agent Reinforcement Learning-Based Hardware/Software\n Co-Optimization Compiler for Improved Performance in DNN Accelerator Design","summary":" This paper presents ARCO, an adaptive Multi-Agent Reinforcement Learning\n(MARL)-based co-optimizing compilation framework designed to enhance the\nefficiency of mapping machine learning (ML) models - such as Deep Neural\nNetworks (DNNs) - onto diverse hardware platforms. The framework incorporates\nthree specialized actor-critic agents within MARL, each dedicated to a distinct\naspect of compilation/optimization at an abstract level: one agent focuses on\nhardware, while two agents focus on software optimizations. This integration\nresults in a collaborative hardware/software co-optimization strategy that\nimproves the precision and speed of DNN deployments. Concentrating on\nhigh-confidence configurations simplifies the search space and delivers\nsuperior performance compared to current optimization methods. 
The ARCO\nframework surpasses existing leading frameworks, achieving a throughput\nincrease of up to 37.95% while reducing the optimization time by up to 42.2%\nacross various DNNs.\n","authors":["Arya Fayyazi","Mehdi Kamal","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2407.08192v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.10170v2","updated":"2024-07-22T17:21:24Z","published":"2024-05-16T15:07:56Z","title":"A Mess of Memory System Benchmarking, Simulation and Application\n Profiling","summary":" The Memory stress (Mess) framework provides a unified view of memory\nsystem benchmarking, simulation and application profiling. The Mess benchmark\nprovides a holistic and detailed memory system characterization. It is based on\nhundreds of measurements that are represented as a family of bandwidth--latency\ncurves. The benchmark increases the coverage of all the previous tools and\nleads to new findings in the behavior of the actual and simulated memory\nsystems. We deploy the Mess benchmark to characterize Intel, AMD, IBM, Fujitsu,\nAmazon and NVIDIA servers with DDR4, DDR5, HBM2 and HBM2E memory. The Mess\nanalytical memory simulator uses the bandwidth--latency concept for memory\nperformance simulation. We integrate Mess with widely-used CPU simulators,\nenabling modeling of all high-end memory technologies. The Mess simulator is\nfast, easy to integrate, and closely matches the actual system performance.\nBy design, it enables a quick adoption of new memory technologies in hardware\nsimulators. Finally, the Mess application profiling positions the application\nin the bandwidth--latency space of the target memory system. This information\ncan be correlated with other application runtime activities and the source\ncode, leading to a better overall understanding of the application's behavior.\nThe current Mess benchmark release covers all major CPU and GPU ISAs: x86, ARM,\nPower, RISC-V, and NVIDIA's PTX. We also release as open source the ZSim, gem5\nand OpenPiton Metro-MPI integrated with the Mess simulator for DDR4, DDR5,\nOptane, HBM2, HBM2E and CXL memory expanders. The Mess application profiling is\nalready integrated into a suite of production HPC performance analysis tools.\n","authors":["Pouya Esmaili-Dokht","Francesco Sgherzi","Valeria Soldera Girelli","Isaac Boixaderas","Mariana Carmin","Alireza Monemi","Adria Armejach","Estanislao Mercadal","German Llort","Petar Radojkovic","Miquel Moreto","Judit Gimenez","Xavier Martorell","Eduard Ayguade","Jesus Labarta","Emanuele Confalonieri","Rishabh Dubey","Jason Adlard"],"pdf_url":"https://arxiv.org/pdf/2405.10170v2.pdf","comment":"17 pages; just accepted in MICRO-57"},{"id":"http://arxiv.org/abs/2407.15440v1","updated":"2024-07-22T07:42:57Z","published":"2024-07-22T07:42:57Z","title":"The Bicameral Cache: a split cache for vector architectures","summary":" The Bicameral Cache is a cache organization proposal for a vector\narchitecture that segregates data according to their access type,\ndistinguishing scalar from vector references. Its aim is to avoid both types of\nreferences from interfering in each other's data locality, with a special focus\non prioritizing the performance on vector references. The proposed system\nincorporates an additional, non-polluting prefetching mechanism to help\npopulate the long vector cache lines in advance to increase the hit rate by\nfurther exploiting the spatial locality on vector data. 
Its evaluation was\nconducted on the Cavatools simulator, comparing the performance to a standard\nconventional cache, over different typical vector benchmarks for several vector\nlengths. The results proved the proposed cache speeds up performance on\nstride-1 vector benchmarks, while hardly impacting non-stride-1's. In addition,\nthe prefetching feature consistently provided additional value.\n","authors":["Susana Rebolledo Ruiz","Borja Perez","Jose Luis Bosque","Peter Hsu"],"pdf_url":"https://arxiv.org/pdf/2407.15440v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.15353v1","updated":"2024-07-22T03:44:27Z","published":"2024-07-22T03:44:27Z","title":"Customized Retrieval Augmented Generation and Benchmarking for EDA Tool\n Documentation QA","summary":" Retrieval augmented generation (RAG) enhances the accuracy and reliability of\ngenerative AI models by sourcing factual information from external databases,\nand is extensively employed in document-grounded question-answering (QA)\ntasks. Off-the-shelf RAG flows are well pretrained on general-purpose\ndocuments, yet they encounter significant challenges when applied to\nknowledge-intensive vertical domains, such as electronic design automation\n(EDA). This paper addresses this issue by proposing a customized RAG framework\nalong with three domain-specific techniques for EDA tool documentation QA,\nincluding a contrastive learning scheme for text embedding model fine-tuning, a\nreranker distilled from a proprietary LLM, and a generative LLM fine-tuned with a\nhigh-quality domain corpus. Furthermore, we have developed and released a\ndocumentation QA evaluation benchmark, ORD-QA, for OpenROAD, an advanced\nRTL-to-GDSII design platform. Experimental results demonstrate that our\nproposed RAG flow and techniques have achieved superior performance on ORD-QA\nas well as on a commercial tool, compared with the state of the art. The ORD-QA\nbenchmark and the training dataset for our customized RAG flow are open-source\nat https://github.com/lesliepy99/RAG-EDA.\n","authors":["Yuan Pu","Zhuolun He","Tairu Qiu","Haoyuan Wu","Bei Yu"],"pdf_url":"https://arxiv.org/pdf/2407.15353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16038v1","updated":"2024-07-22T20:29:56Z","published":"2024-07-22T20:29:56Z","title":"MINT: Securely Mitigating Rowhammer with a Minimalist In-DRAM Tracker","summary":" This paper investigates secure low-cost in-DRAM trackers for mitigating\nRowhammer (RH). In-DRAM solutions have the advantage that they can solve the RH\nproblem within the DRAM chip, without relying on other parts of the system.\nHowever, in-DRAM mitigation suffers from two key challenges: First, the\nmitigations are synchronized with refresh, which means we cannot mitigate at\narbitrary times. Second, the SRAM area available for aggressor tracking is\nseverely limited, to only a few bytes. Existing low-cost in-DRAM trackers (such\nas TRR) have been broken by well-crafted access patterns, whereas prior\ncounter-based schemes require impractical overheads of hundreds or thousands of\nentries per bank. The goal of our paper is to develop an ultra low-cost secure\nin-DRAM tracker.\n Our solution is based on a simple observation: if only one row can be\nmitigated at refresh, then we should ideally need to track only one row. We\npropose a Minimalist In-DRAM Tracker (MINT), which provides secure mitigation\nwith just a single entry. 
At each refresh, MINT probabilistically decides which\nactivation in the upcoming interval will be selected for mitigation at the next\nrefresh. MINT provides guaranteed protection against classic single and\ndouble-sided attacks. We also derive the minimum RH threshold (MinTRH)\ntolerated by MINT across all patterns. MINT has a MinTRH of 1482, which can be\nlowered to 356 with RFM. The MinTRH of MINT is lower than that of a prior counter-based\ndesign with 677 entries per bank, and is within 2x of the MinTRH of an\nidealized design that stores one counter per row. We also analyze the impact of\nrefresh postponement on the MinTRH of low-cost in-DRAM trackers, and propose an\nefficient solution to make such trackers compatible with refresh postponement.\n","authors":["Moinuddin Qureshi","Salman Qazi","Aamer Jaleel"],"pdf_url":"https://arxiv.org/pdf/2407.16038v1.pdf","comment":"13 pages including appendix"},{"id":"http://arxiv.org/abs/2407.16026v1","updated":"2024-07-22T20:07:21Z","published":"2024-07-22T20:07:21Z","title":"KWT-Tiny: RISC-V Accelerated, Embedded Keyword Spotting Transformer","summary":" This paper explores the adaptation of Transformer-based models for edge\ndevices through the quantisation and hardware acceleration of the ARM Keyword\nTransformer (KWT) model on a RISC-V platform. The model was targeted to run on\n64kB RAM in bare-metal C using a custom-developed edge AI library. KWT-1 was\nretrained to be 369 times smaller, with only a 10% loss in accuracy through\nreducing output classes from 35 to 2. The retraining and quantisation reduced\nmodel size from 2.42 MB to 1.65 kB. The integration of custom RISC-V\ninstructions that accelerated GELU and SoftMax operations enabled a 5x speedup\nand thus ~5x power reduction in inference, with inference clock cycle counts\ndecreasing from 26 million to 5.5 million clock cycles while incurring a small\narea overhead of approximately 29%. The results demonstrate a viable method for\nporting and accelerating Transformer-based models in low-power IoT devices.\n","authors":["Aness Al-Qawlaq","Ajay Kumar M","Deepu John"],"pdf_url":"https://arxiv.org/pdf/2407.16026v1.pdf","comment":"6 pages, 7 figures, accepted to be published in the IEEE SOCC 2024\n conference"},{"id":"http://arxiv.org/abs/2407.16006v1","updated":"2024-07-22T19:20:14Z","published":"2024-07-22T19:20:14Z","title":"ImPress: Securing DRAM Against Data-Disturbance Errors via Implicit\n Row-Press Mitigation","summary":" DRAM cells are susceptible to Data-Disturbance Errors (DDE), which can be\nexploited by an attacker to compromise system security. Rowhammer is a\nwell-known DDE vulnerability that occurs when a row is repeatedly activated.\nRowhammer can be mitigated by tracking aggressor rows inside DRAM (in-DRAM) or\nat the Memory Controller (MC). Row-Press (RP) is a new DDE vulnerability that\noccurs when a row is kept open for a long time. RP significantly reduces the\nnumber of activations required to induce an error, thus breaking existing Rowhammer\nsolutions. Prior work on Explicit Row-Press mitigation, ExPress, requires the\nmemory controller to limit the maximum row-open-time, and redesign existing\nRowhammer solutions with a reduced Rowhammer threshold. Unfortunately, ExPress\nincurs significant performance and storage overheads, and being a memory\ncontroller-based solution, it is incompatible with in-DRAM trackers. 
In this\npaper, we propose Implicit Row-Press mitigation (ImPress), which does not\nrestrict row-open-time, is compatible with memory controller-based and in-DRAM\nsolutions, and does not reduce the tolerated Rowhammer threshold. ImPress treats\na row open for a specified time as equivalent to an activation. We design\nImPress by developing a Unified Charge-Loss Model, which combines the net\neffect of both Rowhammer and Row-Press for arbitrary patterns. We analyze both\ncontroller-based (Graphene and PARA) and in-DRAM trackers (Mithril and MINT).\nWe show that ImPress makes Rowhammer solutions resilient to Row-Press\ntransparently, without affecting the Rowhammer threshold.\n","authors":["Moinuddin Qureshi","Anish Saxena","Aamer Jaleel"],"pdf_url":"https://arxiv.org/pdf/2407.16006v1.pdf","comment":"12 page paper"},{"id":"http://arxiv.org/abs/2407.18272v1","updated":"2024-07-22T20:32:16Z","published":"2024-07-22T20:32:16Z","title":"AICircuit: A Multi-Level Dataset and Benchmark for AI-Driven Analog\n Integrated Circuit Design","summary":" Analog and radio-frequency circuit design requires extensive exploration of\nboth circuit topology and parameters to meet specific design criteria like\npower consumption and bandwidth. Designers must review state-of-the-art\ntopology configurations in the literature and sweep various circuit parameters\nwithin each configuration. This design process is highly specialized and\ntime-intensive, particularly as the number of circuit parameters increases and\nthe circuit becomes more complex. Prior research has explored the potential of\nmachine learning to enhance circuit design procedures. However, these studies\nprimarily focus on simple circuits, overlooking the more practical and complex\nanalog and radio-frequency systems. A major obstacle to harnessing the power of\nmachine learning in circuit design is the lack of a generic and diverse\ndataset, along with robust metrics, which are essential for thoroughly\nevaluating and improving machine learning algorithms in the analog and\nradio-frequency circuit domain. We present AICircuit, a comprehensive\nmulti-level dataset and benchmark for developing and evaluating ML algorithms\nin analog and radio-frequency circuit design. AICircuit comprises seven\ncommonly used basic circuits and two complex wireless transceiver systems\ncomposed of multiple circuit blocks, encompassing a wide array of design\nscenarios encountered in real-world applications. We extensively evaluate\nvarious ML algorithms on the dataset, revealing the potential of ML algorithms\nin learning the mapping from the design specifications to the desired circuit\nparameters.\n","authors":["Asal Mehradfar","Xuzhe Zhao","Yue Niu","Sara Babakniya","Mahdi Alesheikh","Hamidreza Aghasi","Salman Avestimehr"],"pdf_url":"https://arxiv.org/pdf/2407.18272v1.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2407.06391v3","updated":"2024-07-22T11:27:45Z","published":"2024-07-08T21:04:15Z","title":"Around Classical and Intuitionistic Linear Processes","summary":" Curry-Howard correspondences between Linear Logic (LL) and session types\nprovide a firm foundation for concurrent processes. As the correspondences hold\nfor intuitionistic and classical versions of LL (ILL and CLL), we obtain two\ndifferent families of type systems for concurrency. An open question remains:\nhow exactly do these two families relate to each other? 
Based upon a\ntranslation from CLL to ILL due to Laurent (2018), we provide two complementary\nanswers, in the form of full abstraction results based on a typed observational\nequivalence due to Atkey (2017). Our results elucidate hitherto missing formal\nlinks between seemingly related yet different type systems for concurrency.\n","authors":["Juan C. Jaramillo","Dan Frumin","Jorge A. Pérez"],"pdf_url":"https://arxiv.org/pdf/2407.06391v3.pdf","comment":"Full version, 19 pages + appendices"},{"id":"http://arxiv.org/abs/2212.11055v5","updated":"2024-07-22T10:02:57Z","published":"2022-12-21T14:59:23Z","title":"Coalgebraic Satisfiability Checking for Arithmetic $μ$-Calculi","summary":" The coalgebraic $\\mu$-calculus provides a generic semantic framework for\nfixpoint logics over systems whose branching type goes beyond the standard\nrelational setup, e.g. probabilistic, weighted, or game-based. Previous work on\nthe coalgebraic $\\mu$-calculus includes an exponential-time upper bound on\nsatisfiability checking, which however relies on the availability of tableau\nrules for the next-step modalities that are sufficiently well-behaved in a\nformally defined sense; in particular, rule matches need to be representable by\npolynomial-sized codes, and the sequent duals of the rules need to absorb cut.\nWhile such rule sets have been identified for some important cases, they are\nnot known to exist in all cases of interest, in particular ones involving\neither integer weights as in the graded $\\mu$-calculus, or real-valued weights\nin combination with non-linear arithmetic. In the present work, we prove the\nsame upper complexity bound under more general assumptions, specifically\nregarding the complexity of the (much simpler) satisfiability problem for the\nunderlying one-step logic, roughly described as the nesting-free next-step\nfragment of the logic. The bound is realized by a generic global caching\nalgorithm that supports on-the-fly satisfiability checking. Notably, our\napproach directly accommodates unguarded formulae, and thus avoids use of the\nguardedness transformation. Example applications include new exponential-time\nupper bounds for satisfiability checking in an extension of the graded\n$\\mu$-calculus with polynomial inequalities (including positive Presburger\narithmetic), as well as an extension of the (two-valued) probabilistic\n$\\mu$-calculus with polynomial inequalities.\n","authors":["Daniel Hausmann","Lutz Schröder"],"pdf_url":"https://arxiv.org/pdf/2212.11055v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15510v1","updated":"2024-07-22T09:49:46Z","published":"2024-07-22T09:49:46Z","title":"Algebraic anti-unification","summary":" Abstraction is key to human and artificial intelligence as it allows one to\nsee common structure in otherwise distinct objects or situations and as such it\nis a key element for generality in AI. Anti-unification (or generalization) is\n\\textit{the} part of theoretical computer science and AI studying abstraction.\nIt has been successfully applied to various AI-related problems, most\nimportantly inductive logic programming. To date, anti-unification has been\nstudied only from a syntactic perspective in the literature. The purpose of\nthis paper is to initiate an algebraic (i.e. semantic) theory of\nanti-unification within general algebras. 
This is motivated by recent\napplications to similarity and analogical proportions.\n","authors":["Christian Antić"],"pdf_url":"https://arxiv.org/pdf/2407.15510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16022v1","updated":"2024-07-22T19:57:19Z","published":"2024-07-22T19:57:19Z","title":"Color Refinement for Relational Structures","summary":" Color Refinement, also known as Naive Vertex Classification, is a classical\nmethod to distinguish graphs by iteratively computing a coloring of their\nvertices. While it is mainly used as an imperfect way to test for isomorphism,\nthe algorithm has permeated many other, seemingly unrelated, areas of computer\nscience. The method is algorithmically simple, and it has a well-understood\ndistinguishing power: It is logically characterized by Cai, F\\\"urer and\nImmerman (1992), who showed that it distinguishes precisely those graphs that\ncan be distinguished by a sentence of first-order logic with counting\nquantifiers and only two variables. A combinatorial characterization is given\nby Dvo\\v{r}\\'ak (2010), who shows that it distinguishes precisely those graphs\nthat can be distinguished by the number of homomorphisms from some tree.\n In this paper, we introduce Relational Color Refinement (RCR, for short), a\ngeneralization of the Color Refinement method from graphs to arbitrary\nrelational structures, whose distinguishing power admits combinatorial and\nlogical characterizations equivalent to those of Color Refinement on graphs:\nWe show that RCR distinguishes precisely those structures that can be\ndistinguished by the number of homomorphisms from an acyclic relational\nstructure. Further, we show that RCR distinguishes precisely those structures\nthat can be distinguished by a sentence of the guarded fragment of first-order\nlogic with counting quantifiers.\n","authors":["Benjamin Scheidt","Nicole Schweikardt"],"pdf_url":"https://arxiv.org/pdf/2407.16022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2006.16039v6","updated":"2024-07-22T09:24:17Z","published":"2020-06-29T13:33:18Z","title":"Game Comonads & Generalised Quantifiers","summary":" Game comonads, introduced by Abramsky, Dawar and Wang and developed by\nAbramsky and Shah, give an interesting categorical semantics to some\nSpoiler-Duplicator games that are common in finite model theory. In particular\nthey expose connections between one-sided and two-sided games, and parameters\nsuch as treewidth and treedepth and corresponding notions of decomposition. In\nthe present paper, we expand the realm of game comonads to logics with\ngeneralised quantifiers. In particular, we introduce a comonad graded by two\nparameters $n \\leq k$ such that isomorphisms in the resulting Kleisli category\nare exactly Duplicator winning strategies in Hella's $n$-bijection game with\n$k$ pebbles. We define a one-sided version of this game which allows us to\nprovide a categorical semantics for a number of logics with generalised\nquantifiers. 
We also give a novel notion of tree decomposition that emerges\nfrom the construction.\n","authors":["Adam Ó Conghaile","Anuj Dawar"],"pdf_url":"https://arxiv.org/pdf/2006.16039v6.pdf","comment":null}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2407.15805v1","updated":"2024-07-22T17:18:26Z","published":"2024-07-22T17:18:26Z","title":"A simple and fast C++ thread pool implementation capable of running task\n graphs","summary":" In this paper, the author presents a simple and fast C++ thread pool\nimplementation capable of running task graphs. The implementation is publicly\navailable on GitHub, see https://github.com/dpuyda/scheduling.\n","authors":["Dmytro Puyda"],"pdf_url":"https://arxiv.org/pdf/2407.15805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15738v1","updated":"2024-07-22T15:41:23Z","published":"2024-07-22T15:41:23Z","title":"Parallel Split Learning with Global Sampling","summary":" The expansion of IoT devices and the demands of Deep Learning have\nhighlighted significant challenges in Distributed Deep Learning (DDL) systems.\nParallel Split Learning (PSL) has emerged as a promising derivative of Split\nLearning that is well suited for distributed learning on resource-constrained\ndevices. However, PSL faces several obstacles, such as large effective batch\nsizes, non-IID data distributions, and the straggler effect. We view these\nissues as a sampling dilemma and propose to address them by orchestrating the\nmini-batch sampling process on the server side. We introduce the Uniform Global\nSampling (UGS) method to decouple the effective batch size from the number of\nclients and reduce mini-batch deviation in non-IID settings. To address the\nstraggler effect, we introduce the Latent Dirichlet Sampling (LDS) method,\nwhich generalizes UGS to balance the trade-off between batch deviation and\ntraining time. Our simulations reveal that our proposed methods enhance model\naccuracy by up to 34.1% in non-IID settings and reduce the training time in the\npresence of stragglers by up to 62%. In particular, LDS effectively mitigates\nthe straggler effect without compromising model accuracy or adding significant\ncomputational overhead compared to UGS. Our results demonstrate the potential\nof our methods as a promising solution for DDL in real applications.\n","authors":["Mohammad Kohankhaki","Ahmad Ayad","Mahdi Barhoush","Anke Schmeink"],"pdf_url":"https://arxiv.org/pdf/2407.15738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15716v1","updated":"2024-07-22T15:22:07Z","published":"2024-07-22T15:22:07Z","title":"CrashEventLLM: Predicting System Crashes with Large Language Models","summary":" As the dependence on computer systems expands across various domains,\nspanning personal, industrial, and large-scale applications, there arises a\ncompelling need to enhance their reliability to sustain business operations\nseamlessly and ensure optimal user satisfaction. System logs generated by these\ndevices serve as valuable repositories of historical trends and past failures.\nThe use of machine learning techniques for failure prediction has become\ncommonplace, enabling the extraction of insights from past data to anticipate\nfuture behavior patterns. Recently, large language models have demonstrated\nremarkable capabilities in tasks including summarization, reasoning, and event\nprediction. 
Therefore, in this paper, we endeavor to investigate the potential\nof large language models in predicting system failures, leveraging insights\nlearned from past failure behavior to inform reasoning and decision-making\nprocesses effectively. Our approach involves leveraging data from the Intel\nComputing Improvement Program (ICIP) system crash logs to identify significant\nevents and develop CrashEventLLM. This model, built upon a large language model\nframework, serves as our foundation for crash event prediction. Specifically,\nour model utilizes historical data to forecast future crash events, informed by\nexpert annotations. Additionally, it goes beyond mere prediction, offering\ninsights into potential causes for each crash event. This work provides\npreliminary insights into prompt-based large language models for the log-based\nevent prediction task.\n","authors":["Priyanka Mudgal","Bijan Arbab","Swaathi Sampath Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.15716v1.pdf","comment":"Accepted in ICITCOM'24. Copyrights will be with IEEE"},{"id":"http://arxiv.org/abs/2407.15309v1","updated":"2024-07-22T14:37:58Z","published":"2024-07-22T14:37:58Z","title":"vTensor: Flexible Virtual Tensor Management for Efficient LLM Serving","summary":" Large Language Models (LLMs) are widely used across various domains,\nprocessing millions of daily requests. This surge in demand poses significant\nchallenges in optimizing throughput and latency while keeping costs manageable.\nThe Key-Value (KV) cache, a standard method for retaining previous\ncomputations, makes LLM inference highly bounded by memory. While batching\nstrategies can enhance performance, they frequently lead to significant memory\nfragmentation. Even though cutting-edge systems like vLLM mitigate KV cache\nfragmentation using paged Attention mechanisms, they still suffer from\ninefficient memory and computational operations due to the tightly coupled page\nmanagement and computation kernels.\n This study introduces vTensor, an innovative tensor structure for LLM\ninference based on GPU virtual memory management (VMM). vTensor addresses\nexisting limitations by decoupling computation from memory defragmentation and\noffering dynamic extensibility. 
Our framework employs a CPU-GPU heterogeneous\napproach, ensuring efficient, fragmentation-free memory management while\naccommodating various computation kernels across different LLM architectures.\nExperimental results indicate that vTensor achieves an average speedup of 1.86x\nacross different models, with up to 2.42x in multi-turn chat scenarios.\nAdditionally, vTensor provides average speedups of 2.12x and 3.15x in kernel\nevaluation, reaching up to 3.92x and 3.27x compared to SGLang Triton\nprefix-prefilling kernels and vLLM paged Attention kernel, respectively.\nFurthermore, it frees approximately 71.25% (57GB) of memory on the NVIDIA A100\nGPU compared to vLLM, enabling more memory-intensive workloads.\n","authors":["Jiale Xu","Rui Zhang","Cong Guo","Weiming Hu","Zihan Liu","Feiyang Wu","Yu Feng","Shixuan Sun","Changxu Shao","Yuhong Guo","Junping Zhao","Ke Zhang","Minyi Guo","Jingwen Leng"],"pdf_url":"https://arxiv.org/pdf/2407.15309v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.15567v1","updated":"2024-07-22T11:52:58Z","published":"2024-07-22T11:52:58Z","title":"A New Theoretical Perspective on Data Heterogeneity in Federated\n Optimization","summary":" In federated learning (FL), data heterogeneity is the main reason that\nexisting theoretical analyses are pessimistic about the convergence rate. In\nparticular, for many FL algorithms, the convergence rate grows dramatically\nwhen the number of local updates becomes large, especially when the product of\nthe gradient divergence and local Lipschitz constant is large. However,\nempirical studies can show that more local updates can improve the convergence\nrate even when these two parameters are large, which is inconsistent with the\ntheoretical findings. This paper aims to bridge this gap between theoretical\nunderstanding and practical performance by providing a theoretical analysis\nfrom a new perspective on data heterogeneity. In particular, we propose a new\nand weaker assumption compared to the local Lipschitz gradient assumption,\nnamed the heterogeneity-driven pseudo-Lipschitz assumption. We show that this\nand the gradient divergence assumptions can jointly characterize the effect of\ndata heterogeneity. By deriving a convergence upper bound for FedAvg and its\nextensions, we show that, compared to the existing works, local Lipschitz\nconstant is replaced by the much smaller heterogeneity-driven pseudo-Lipschitz\nconstant and the corresponding convergence upper bound can be significantly\nreduced for the same number of local updates, although its order stays the\nsame. In addition, when the local objective function is quadratic, more\ninsights on the impact of data heterogeneity can be obtained using the\nheterogeneity-driven pseudo-Lipschitz constant. For example, we can identify a\nregion where FedAvg can outperform mini-batch SGD even when the gradient\ndivergence can be arbitrarily large. Our findings are validated using\nexperiments.\n","authors":["Jiayi Wang","Shiqiang Wang","Rong-Rong Chen","Mingyue Ji"],"pdf_url":"https://arxiv.org/pdf/2407.15567v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2404.14527v4","updated":"2024-07-22T10:56:19Z","published":"2024-04-22T18:56:18Z","title":"Mélange: Cost Efficient Large Language Model Serving by Exploiting GPU\n Heterogeneity","summary":" Large language models (LLMs) are increasingly integrated into many online\nservices, yet they remain cost-prohibitive to deploy due to the requirement of\nexpensive GPU instances. 
Prior work has addressed the high cost of LLM serving\nby improving the inference engine, but less attention has been given to\nselecting the most cost-efficient GPU type(s) for a specific LLM service. There\nis a large and growing landscape of GPU types and, within these options, higher\ncost does not always lead to increased performance. Instead, through a\ncomprehensive investigation, we find that three key LLM service characteristics\n(request size, request rate, SLO) strongly influence GPU cost efficiency, and\ndiffering GPU types are most cost efficient for differing LLM service settings.\nAs a result, the most cost-efficient allocation for a given service is\ntypically a mix of heterogeneous GPU types. Based on this analysis, we\nintroduce M\\'elange, a GPU allocation framework that navigates these diverse\nLLM service characteristics and heterogeneous GPU option space to automatically\nand efficiently derive the minimal-cost GPU allocation for a given LLM service.\nWe formulate the GPU allocation task as a cost-aware bin packing problem where\nGPUs are bins and items are slices of the service workload. Our formulation's\nconstraints account for a service's unique characteristics, allowing M\\'elange\nto be flexible to support diverse service settings and heterogeneity-aware to\nadapt the GPU allocation to a specific service. Compared to using only a single\nGPU type, M\\'elange reduces deployment costs by up to 77% in conversational\nsettings, 33% in document-based settings, and 51% in a mixed setting.\n","authors":["Tyler Griggs","Xiaoxuan Liu","Jiaxiang Yu","Doyoung Kim","Wei-Lin Chiang","Alvin Cheung","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2404.14527v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06368v3","updated":"2024-07-22T10:21:49Z","published":"2024-05-10T10:10:37Z","title":"DP-DyLoRA: Fine-Tuning Transformer-Based Models On-Device under\n Differentially Private Federated Learning using Dynamic Low-Rank Adaptation","summary":" Federated learning (FL) allows clients to collaboratively train a global\nmodel without sharing their local data with a server. However, clients'\ncontributions to the server can still leak sensitive information. Differential\nprivacy (DP) addresses such leakage by providing formal privacy guarantees,\nwith mechanisms that add randomness to the clients' contributions. The\nrandomness makes it infeasible to train large transformer-based models, common\nin modern federated learning systems. In this work, we empirically evaluate the\npracticality of fine-tuning large scale on-device transformer-based models with\ndifferential privacy in a federated learning system. We conduct comprehensive\nexperiments on various system properties for tasks spanning a multitude of\ndomains: speech recognition, computer vision (CV) and natural language\nunderstanding (NLU). Our results show that full fine-tuning under\ndifferentially private federated learning (DP-FL) generally leads to huge\nperformance degradation which can be alleviated by reducing the dimensionality\nof contributions through parameter-efficient fine-tuning (PEFT). Our benchmarks\nof existing DP-PEFT methods show that DP-Low-Rank Adaptation (DP-LoRA)\nconsistently outperforms other methods. An even more promising approach,\nDyLoRA, which makes the low rank variable, when naively combined with FL would\nstraightforwardly break differential privacy. We therefore propose an\nadaptation method that can be combined with differential privacy and call it\nDP-DyLoRA. 
Finally, we are able to reduce the accuracy degradation and word\nerror rate (WER) increase due to DP to less than 2% and 7% respectively with 1\nmillion clients and a stringent privacy budget of $\\epsilon=2$.\n","authors":["Jie Xu","Karthikeyan Saravanan","Rogier van Dalen","Haaris Mehmood","David Tuckey","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2405.06368v3.pdf","comment":"16 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.15464v1","updated":"2024-07-22T08:24:45Z","published":"2024-07-22T08:24:45Z","title":"The Diversity Bonus: Learning from Dissimilar Distributed Clients in\n Personalized Federated Learning","summary":" Personalized Federated Learning (PFL) is a commonly used framework that\nallows clients to collaboratively train their personalized models. PFL is\nparticularly useful for handling situations where data from different clients\nare not independent and identically distributed (non-IID). Previous research in\nPFL implicitly assumes that clients can gain more benefits from those with\nsimilar data distributions. Correspondingly, methods such as personalized\nweight aggregation are developed to assign higher weights to similar clients\nduring training. We pose a question: can a client benefit from other clients\nwith dissimilar data distributions and if so, how? This question is\nparticularly relevant in scenarios with a high degree of non-IID, where clients\nhave widely different data distributions, and learning from only similar\nclients will lose knowledge from many other clients. We note that when dealing\nwith clients with similar data distributions, methods such as personalized\nweight aggregation tend to enforce their models to be close in the parameter\nspace. It is reasonable to conjecture that a client can benefit from dissimilar\nclients if we allow their models to depart from each other. Based on this idea,\nwe propose DiversiFed which allows each client to learn from clients with\ndiversified data distribution in personalized federated learning. DiversiFed\npushes personalized models of clients with dissimilar data distributions apart\nin the parameter space while pulling together those with similar distributions.\nIn addition, to achieve the above effect without using prior knowledge of data\ndistribution, we design a loss function that leverages the model similarity to\ndetermine the degree of attraction and repulsion between any two models.\nExperiments on several datasets show that DiversiFed can benefit from\ndissimilar clients and thus outperform the state-of-the-art methods.\n","authors":["Xinghao Wu","Xuefeng Liu","Jianwei Niu","Guogang Zhu","Shaojie Tang","Xiaotian Li","Jiannong Cao"],"pdf_url":"https://arxiv.org/pdf/2407.15464v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.15870v3","updated":"2024-07-22T08:18:38Z","published":"2023-07-29T02:35:37Z","title":"SemiSFL: Split Federated Learning on Unlabeled and Non-IID Data","summary":" Federated Learning (FL) has emerged to allow multiple clients to\ncollaboratively train machine learning models on their private data at the\nnetwork edge. However, training and deploying large-scale models on\nresource-constrained devices is challenging. Fortunately, Split Federated\nLearning (SFL) offers a feasible solution by alleviating the computation and/or\ncommunication burden on clients. However, existing SFL works often assume\nsufficient labeled data on clients, which is usually impractical. 
Besides, data\nnon-IIDness poses another challenge to ensuring efficient model training. To the\nbest of our knowledge, the above two issues have not been simultaneously\naddressed in SFL. Herein, we propose a novel Semi-supervised SFL system, termed SemiSFL,\nwhich incorporates clustering regularization to perform SFL with unlabeled and\nnon-IID client data. Moreover, our theoretical and experimental investigations\ninto model convergence reveal that the inconsistent training processes on\nlabeled and unlabeled data have an influence on the effectiveness of clustering\nregularization. To mitigate the training inconsistency, we develop an algorithm\nfor dynamically adjusting the global updating frequency, so as to improve\ntraining performance. Extensive experiments on benchmark models and datasets\nshow that our system provides a 3.8x speed-up in training time, reduces the\ncommunication cost by about 70.3% while reaching the target accuracy, and\nachieves up to 5.8% improvement in accuracy under non-IID scenarios compared to\nthe state-of-the-art baselines.\n","authors":["Yang Xu","Yunming Liao","Hongli Xu","Zhipeng Sun","Liusheng Huang","Chunming Qiao"],"pdf_url":"https://arxiv.org/pdf/2307.15870v3.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2407.15452v1","updated":"2024-07-22T08:09:36Z","published":"2024-07-22T08:09:36Z","title":"GraphScale: A Framework to Enable Machine Learning over Billion-node\n Graphs","summary":" Graph Neural Networks (GNNs) have emerged as powerful tools for supervised\nmachine learning over graph-structured data, while sampling-based node\nrepresentation learning is widely utilized in unsupervised learning. However,\nscalability remains a major challenge in both supervised and unsupervised\nlearning for large graphs (e.g., those with over 1 billion nodes). The\nscalability bottleneck largely stems from the mini-batch sampling phase in GNNs\nand the random walk sampling phase in unsupervised methods. These processes\noften require storing features or embeddings in memory. In the context of\ndistributed training, they require frequent, inefficient random access to data\nstored across different workers. Such repeated inter-worker communication for\neach mini-batch leads to high communication overhead and computational\ninefficiency.\n We propose GraphScale, a unified framework for both supervised and\nunsupervised learning to store and process large graph data in a distributed\nmanner. The key insight in our design is the separation of workers who store data and those\nwho perform the training. This separation allows us to decouple computing and\nstorage in graph training, thus effectively building a pipeline where data\nfetching and data computation can overlap asynchronously. Our experiments show\nthat GraphScale outperforms state-of-the-art methods for distributed training\nof both GNNs and node embeddings. We evaluate GraphScale both on public and\nproprietary graph datasets and observe a reduction of at least 40% in\nend-to-end training times compared to popular distributed frameworks, without\nany loss in performance. 
While most existing methods don't support billion-node\ngraphs for training node embeddings, GraphScale is currently deployed in\nproduction at TikTok, enabling efficient learning over such large graphs.\n","authors":["Vipul Gupta","Xin Chen","Ruoyun Huang","Fanlong Meng","Jianjun Chen","Yujun Yan"],"pdf_url":"https://arxiv.org/pdf/2407.15452v1.pdf","comment":"Published in the Proceedings of the 33rd ACM International Conference\n on Information and Knowledge Management (CIKM 2024), 8 Pages, 12 Figures"},{"id":"http://arxiv.org/abs/2407.00031v2","updated":"2024-07-22T07:01:48Z","published":"2024-05-21T21:22:16Z","title":"Supercharging Federated Learning with Flower and NVIDIA FLARE","summary":" Several open-source systems, such as Flower and NVIDIA FLARE, have been\ndeveloped in recent years, each focusing on different aspects of federated\nlearning (FL). Flower is dedicated to implementing a cohesive approach to FL,\nanalytics, and evaluation. Over time, Flower has cultivated extensive\nstrategies and algorithms tailored for FL application development, fostering a\nvibrant FL community in research and industry. Conversely, FLARE has\nprioritized the creation of an enterprise-ready, resilient runtime environment\nexplicitly designed for FL applications in production environments. In this\npaper, we describe our initial integration of both frameworks and show how they\ncan work together to supercharge the FL ecosystem as a whole. Through the\nseamless integration of Flower and FLARE, applications crafted within the\nFlower framework can effortlessly operate within the FLARE runtime environment\nwithout necessitating any modifications. This initial integration streamlines\nthe process, eliminating complexities and ensuring smooth interoperability\nbetween the two platforms, thus enhancing the overall efficiency and\naccessibility of FL applications.\n","authors":["Holger R. Roth","Daniel J. Beutel","Yan Cheng","Javier Fernandez Marques","Heng Pan","Chester Chen","Zhihong Zhang","Yuhong Wen","Sean Yang","Isaac Yang","Yuan-Ting Hsieh","Ziyue Xu","Daguang Xu","Nicholas D. Lane","Andrew Feng"],"pdf_url":"https://arxiv.org/pdf/2407.00031v2.pdf","comment":"Added a figure comparing running a Flower application natively or\n within FLARE"},{"id":"http://arxiv.org/abs/2404.12666v2","updated":"2024-07-22T06:52:46Z","published":"2024-04-19T07:06:40Z","title":"A Survey on Federated Analytics: Taxonomy, Enabling Techniques,\n Applications and Open Issues","summary":" The escalating influx of data generated by networked edge devices, coupled\nwith the growing awareness of data privacy, has restricted the traditional data\nanalytics workflow, where the edge data are gathered by a centralized server to\nbe further utilized by data analysts. To continue leveraging vast edge data to\nsupport various data-intensive applications, a transformative shift is promoted\nin computing paradigms from centralized data processing to privacy-preserved\ndistributed data processing. The need to perform data analytics on private edge\ndata motivates federated analytics (FA), an emerging technique to support\ncollaborative data analytics among diverse data owners without centralizing the\nraw data. Despite the wide applications of FA in industry and academia, a\ncomprehensive examination of existing research efforts in FA has been notably\nabsent. This survey aims to bridge this gap by first providing an overview of\nFA, elucidating key concepts, and discussing its relationship with similar\nconcepts. 
We then conduct a thorough examination of FA, including its key\nchallenges, taxonomy, and enabling techniques. Diverse FA applications,\nincluding statistical metrics, frequency-related applications, database query\noperations, FL-assisting FA tasks, and other wireless network applications, are\nthen carefully reviewed. We complete the survey with several open research\nissues, future directions, and a comprehensive lessons-learned section. This\nsurvey intends to provide a holistic understanding of the emerging FA\ntechniques and foster the continued evolution of privacy-preserving distributed\ndata processing in the emerging networked society.\n","authors":["Zibo Wang","Haichao Ji","Yifei Zhu","Dan Wang","Zhu Han"],"pdf_url":"https://arxiv.org/pdf/2404.12666v2.pdf","comment":"This survey has been submitted to IEEE Communications Surveys &\n Tutorials"},{"id":"http://arxiv.org/abs/2311.13348v2","updated":"2024-07-22T06:43:13Z","published":"2023-11-22T12:25:02Z","title":"MergeSFL: Split Federated Learning with Feature Merging and Batch Size\n Regulation","summary":" Recently, federated learning (FL) has emerged as a popular technique for edge\nAI to mine valuable knowledge in edge computing (EC) systems. To mitigate the\ncomputing/communication burden on resource-constrained workers and protect\nmodel privacy, split federated learning (SFL) has been developed by integrating\nboth data and model parallelism. Despite resource limitations, SFL still faces\ntwo other critical challenges in EC, i.e., statistical heterogeneity and system\nheterogeneity. To address these challenges, we propose a novel SFL framework,\ntermed MergeSFL, by incorporating feature merging and batch size regulation in\nSFL. Concretely, feature merging aims to merge the features from workers into a\nmixed feature sequence, which is approximately equivalent to the features\nderived from IID data and is employed to promote model accuracy. Meanwhile, batch\nsize regulation aims to assign diverse and suitable batch sizes for\nheterogeneous workers to improve training efficiency. Moreover, MergeSFL\nexplores how to jointly optimize these two strategies upon their coupled\nrelationship to better enhance the performance of SFL. Extensive experiments\nare conducted on a physical platform with 80 NVIDIA Jetson edge devices, and\nthe experimental results show that MergeSFL can improve the final model\naccuracy by 5.82% to 26.22%, with a speedup of about 1.74x to 4.14x, compared\nto the baselines.\n","authors":["Yunming Liao","Yang Xu","Hongli Xu","Lun Wang","Zhiwei Yao","Chunming Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.13348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15406v1","updated":"2024-07-22T06:22:36Z","published":"2024-07-22T06:22:36Z","title":"Automated Road Safety: Enhancing Sign and Surface Damage Detection with\n AI","summary":" Public transportation plays a crucial role in our lives, and the road network\nis a vital component in the implementation of smart cities. Recent advancements\nin AI have enabled the development of advanced monitoring systems capable of\ndetecting anomalies in road surfaces and road signs, which, if unaddressed, can\nlead to serious road accidents. This paper presents an innovative approach to\nenhance road safety through the detection and classification of traffic signs\nand road surface damage using advanced deep learning techniques. 
This\nintegrated approach supports proactive maintenance strategies, improving road\nsafety and resource allocation for the Molise region and the city of\nCampobasso. The resulting system, developed as part of the Casa delle\nTecnologie Emergenti (House of Emerging Technologies) Molise (Molise CTE)\nresearch project funded by the Italian Ministry of Economic Growth (MIMIT),\nleverages cutting-edge technologies such as Cloud Computing and High\nPerformance Computing with GPU utilization. It serves as a valuable tool for\nmunicipalities, enabling quick detection of anomalies and the prompt\norganization of maintenance operations.\n","authors":["Davide Merolla","Vittorio Latorre","Antonio Salis","Gianluca Boanelli"],"pdf_url":"https://arxiv.org/pdf/2407.15406v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.15402v1","updated":"2024-07-22T06:08:13Z","published":"2024-07-22T06:08:13Z","title":"Tackling Selfish Clients in Federated Learning","summary":" Federated Learning (FL) is a distributed machine learning paradigm\nthat enables participants to collaboratively train a model without revealing\ntheir local data. However, when FL is deployed into the wild, some intelligent\nclients can deliberately deviate from the standard training process to make the\nglobal model inclined toward their local model, thereby prioritizing their\nlocal data distribution. We refer to this novel category of misbehaving clients\nas selfish. In this paper, we propose a Robust aggregation strategy for the FL\nserver to mitigate the effect of Selfishness (in short RFL-Self). RFL-Self\nincorporates an innovative method to recover (or estimate) the true updates of\nselfish clients from the received ones, leveraging robust statistics (median of\nnorms) of the updates at every round. By including the recovered updates in\naggregation, our strategy offers strong robustness against selfishness. Our\nexperimental results, obtained on MNIST and CIFAR-10 datasets, demonstrate that\njust 2% of clients behaving selfishly can decrease the accuracy by up to 36%,\nand RFL-Self can mitigate that effect without degrading the global model\nperformance.\n","authors":["Andrea Augello","Ashish Gupta","Giuseppe Lo Re","Sajal K. Das"],"pdf_url":"https://arxiv.org/pdf/2407.15402v1.pdf","comment":"10 pages, 16 figures. European Conference on Artificial Intelligence\n (ECAI) 2024"},{"id":"http://arxiv.org/abs/2407.15389v1","updated":"2024-07-22T05:34:47Z","published":"2024-07-22T05:34:47Z","title":"Poisoning with A Pill: Circumventing Detection in Federated Learning","summary":" Without direct access to the client's data, federated learning (FL) is\nwell-known for its unique strength in data privacy protection among existing\ndistributed machine learning techniques. However, its distributed and\niterative nature makes FL inherently vulnerable to various poisoning attacks.\nTo counteract these threats, extensive defenses have been proposed to filter\nout malicious clients, using various detection metrics. Based on our analysis\nof existing attacks and defenses, we find that there is a lack of attention to\nmodel redundancy. In neural networks, various model parameters contribute\ndifferently to the model's performance. However, existing attacks in FL\nmanipulate all the model update parameters with the same strategy, making them\neasily detectable by common defenses. Meanwhile, the defenses also tend to\nanalyze the overall statistical features of the entire model updates, leaving\nroom for sophisticated attacks. 
Based on these observations, this paper\nproposes a generic and attack-agnostic augmentation approach designed to\nenhance the effectiveness and stealthiness of existing FL poisoning attacks\nagainst detection in FL, pointing out the inherent flaws of existing defenses\nand exposing the necessity of fine-grained FL security. Specifically, we employ\na three-stage methodology that strategically constructs, generates, and injects\npoison (generated by existing attacks) into a pill (a tiny subnet with a novel\nstructure) during FL training, named pill construction, pill poisoning,\nand pill injection, respectively. Extensive experimental results show that FL\npoisoning attacks enhanced by our method can bypass all the popular defenses,\nand can gain up to a 7x error rate increase, as well as, on average, a more than\n2x error rate increase on both IID and non-IID data, in both cross-silo and\ncross-device FL systems.\n","authors":["Hanxi Guo","Hao Wang","Tao Song","Tianhang Zheng","Yang Hua","Haibing Guan","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16201v2","updated":"2024-07-22T22:33:07Z","published":"2024-02-25T21:29:44Z","title":"Honeybee: Decentralized Peer Sampling with Verifiable Random Walks for\n Blockchain Data Sharding","summary":" Data sharding$\\unicode{x2013}$in which block data is sharded without sharding\ncompute$\\unicode{x2013}$is at present the favored approach for scaling\nEthereum and other popular blockchains. A key challenge toward implementing\ndata sharding is verifying whether the entirety of a block's data is available\nin the network (across its shards). A central technique proposed to conduct\nthis verification uses erasure-coded blocks and is called data availability\nsampling (DAS). While the high-level protocol details of DAS have been well\ndiscussed in the community, discussions around how such a protocol will be\nimplemented at the peer-to-peer layer are lacking. We identify random sampling\nof nodes as a fundamental primitive necessary to carry out DAS and present\nHoneybee, a decentralized algorithm for sampling nodes that uses verifiable\nrandom walks. Honeybee is secure against attacks even in the presence of a\nlarge number of Byzantine nodes (e.g., 50% of the network). We evaluate\nHoneybee through experiments and show that the quality of sampling achieved by\nHoneybee is significantly better compared to the state-of-the-art. Our proposed\nalgorithm has implications for DAS functions in both full nodes and light\nnodes.\n","authors":["Yunqi Zhang","Shaileshh Bojja Venkatakrishnan"],"pdf_url":"https://arxiv.org/pdf/2402.16201v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.15967v1","updated":"2024-07-22T18:27:29Z","published":"2024-07-22T18:27:29Z","title":"Versioned Analysis of Software Quality Indicators and Self-admitted\n Technical Debt in Ethereum Smart Contracts with Ethstractor","summary":" The rise of decentralized applications (dApps) has made smart contracts\nimperative components of blockchain technology. As many smart contracts process\nfinancial transactions, their security is paramount. Moreover, the immutability\nof blockchains makes vulnerabilities in smart contracts particularly\nchallenging, because fixing them requires deploying a new version of the contract at a\ndifferent address, incurring substantial fees paid in Ether. This paper\nproposes Ethstractor, the first smart contract collection tool for gathering a\ndataset of versioned smart contracts. 
The collected dataset is then used to\nevaluate the reliability of code metrics as indicators of vulnerabilities in\nsmart contracts. Our findings indicate that code metrics are ineffective in\nsignalling the presence of vulnerabilities. Furthermore, we investigate whether\nvulnerabilities in newer versions of smart contracts are mitigated and identify\nthat the number of vulnerabilities remains consistent over time. Finally, we\nexamine the removal of self-admitted technical debt in contracts and uncover\nthat most of the introduced debt has never been subsequently removed.\n","authors":["Khalid Hassan","Saeed Moradi","Shaiful Chowdhury","Sara Rouhani"],"pdf_url":"https://arxiv.org/pdf/2407.15967v1.pdf","comment":"Copyright 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2407.15676v1","updated":"2024-07-22T14:43:05Z","published":"2024-07-22T14:43:05Z","title":"Preventing Out-of-Gas Exceptions by Typing","summary":" We continue the development of TinySol, a minimal object-oriented language\nbased on Solidity, the standard smart-contract language used for the Ethereum\nplatform. We first extend TinySol with exceptions and a gas mechanism, and\nequip it with a small-step operational semantics. Introducing the gas mechanism\nis fundamental for modelling real-life smart contracts in TinySol, since this\nis the way in which termination of Ethereum smart contracts is usually ensured.\nWe then devise a type system for smart contracts guaranteeing that such\nprograms never run out of gas at runtime. This is a desirable property for\nsmart contracts, since a transaction that runs out of gas is aborted, but the\nprice paid to run the code is not returned to the invoker.\n","authors":["Luca Aceto","Daniele Gorla","Stian Lybech","Mohammad Hamdaqa"],"pdf_url":"https://arxiv.org/pdf/2407.15676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15551v1","updated":"2024-07-22T11:26:04Z","published":"2024-07-22T11:26:04Z","title":"MoXIchecker: An Extensible Model Checker for MoXI","summary":" MoXI is a new intermediate verification language introduced in 2024 to\npromote the standardization and open-source implementations for symbolic model\nchecking by extending the SMT-LIB 2 language with constructs to define\nstate-transition systems. The tool suite of MoXI provides a translator from\nMoXI to Btor2, which is a lower-level intermediate language for hardware\nverification, and a translation-based model checker, which invokes mature\nhardware model checkers for Btor2 to analyze the translated verification tasks.\nThe extensibility of such a translation-based model checker is restricted\nbecause more complex theories, such as integer or real arithmetics, cannot be\nprecisely expressed with bit-vectors of fixed lengths in Btor2. We present\nMoXIchecker, the first model checker that solves MoXI verification tasks\ndirectly. Instead of translating MoXI to lower-level languages, MoXIchecker\nuses the solver-agnostic library PySMT for SMT solvers as backend for its\nverification algorithms. 
MoXIchecker is extensible because it accommodates\nverification tasks involving more complex theories, not limited by lower-level\nlanguages, facilitates the implementation of new algorithms, and is\nsolver-agnostic by using the API of PySMT. In our evaluation, MoXIchecker\nuniquely solved tasks that use integer or real arithmetic, and achieved\nperformance comparable to the translation-based model checker from the\nMoXI tool suite.\n","authors":["Salih Ates","Dirk Beyer","Po-Chun Chien","Nian-Ze Lee"],"pdf_url":"https://arxiv.org/pdf/2407.15551v1.pdf","comment":"13 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.14787v2","updated":"2024-07-22T23:14:04Z","published":"2024-06-20T23:30:51Z","title":"Story of Your Lazy Function's Life: A Bidirectional Demand Semantics for\n Mechanized Cost Analysis of Lazy Programs","summary":" Lazy evaluation is a powerful tool that enables better compositionality and\npotentially better performance in functional programming, but it is challenging\nto analyze its computation cost. Existing works either require manually\nannotating sharing, or rely on separation logic to reason about heaps of\nmutable cells. In this paper, we propose a bidirectional demand semantics that\nallows for extrinsic reasoning about the computation cost of lazy programs\nwithout relying on special program logics. To show the effectiveness of our\napproach, we apply the demand semantics to a variety of case studies including\ninsertion sort, selection sort, Okasaki's banker's queue, and the implicit\nqueue. We formally prove that the banker's queue and the implicit queue are\nboth amortized and persistent using the Rocq Prover (formerly known as Coq). We\nalso propose the reverse physicist's method, a novel variant of the classical\nphysicist's method, which enables mechanized, modular and compositional\nreasoning about amortization and persistence with the demand semantics.\n","authors":["Li-yao Xia","Laura Israel","Maite Kramarz","Nicholas Coltharp","Koen Claessen","Stephanie Weirich","Yao Li"],"pdf_url":"https://arxiv.org/pdf/2406.14787v2.pdf","comment":"Accepted by ICFP 2024"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2407.15440v1","updated":"2024-07-22T07:42:57Z","published":"2024-07-22T07:42:57Z","title":"The Bicameral Cache: a split cache for vector architectures","summary":" The Bicameral Cache is a cache organization proposal for a vector\narchitecture that segregates data according to their access type,\ndistinguishing scalar from vector references. Its aim is to avoid both types of\nreferences from interfering in each other's data locality, with a special focus\non prioritizing the performance on vector references. The proposed system\nincorporates an additional, non-polluting prefetching mechanism to help\npopulate the long vector cache lines in advance to increase the hit rate by\nfurther exploiting the spatial locality on vector data. Its evaluation was\nconducted on the Cavatools simulator, comparing its performance to that of a\nconventional cache over typical vector benchmarks for several vector\nlengths. The results show that the proposed cache speeds up stride-1 vector\nbenchmarks while hardly impacting non-stride-1 ones. 
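The segregation idea can be illustrated with a toy simulator that routes scalar and vector references to separate structures, so a long vector sweep cannot evict the scalar working set. All sizes, names, and the access pattern below are invented, and this is far simpler than the actual Bicameral Cache (no prefetcher, no long-line population logic).

# Toy illustration of a split cache: scalar and vector references live in
# separate direct-mapped structures, so they cannot evict each other's lines.
class DirectMappedCache:
    def __init__(self, num_sets, line_bytes):
        self.num_sets, self.line_bytes = num_sets, line_bytes
        self.tags = [None] * num_sets
        self.hits = self.misses = 0

    def access(self, addr):
        line = addr // self.line_bytes
        idx, tag = line % self.num_sets, line // self.num_sets
        if self.tags[idx] == tag:
            self.hits += 1
        else:
            self.misses += 1
            self.tags[idx] = tag  # fill on miss

scalar_cache = DirectMappedCache(num_sets=64, line_bytes=64)
vector_cache = DirectMappedCache(num_sets=16, line_bytes=512)  # long vector lines

def access(addr, is_vector):
    (vector_cache if is_vector else scalar_cache).access(addr)

# A stride-1 vector sweep no longer disturbs the small scalar working set.
for i in range(4096):
    access(0x100000 + 8 * i, is_vector=True)   # streaming vector data
    access((i % 32) * 64, is_vector=False)     # 32-line scalar working set
print("scalar:", scalar_cache.hits, scalar_cache.misses,
      "vector:", vector_cache.hits, vector_cache.misses)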
In addition,\nthe prefetching feature consistently provided additional value.\n","authors":["Susana Rebolledo Ruiz","Borja Perez","Jose Luis Bosque","Peter Hsu"],"pdf_url":"https://arxiv.org/pdf/2407.15440v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.16026v1","updated":"2024-07-22T20:07:21Z","published":"2024-07-22T20:07:21Z","title":"KWT-Tiny: RISC-V Accelerated, Embedded Keyword Spotting Transformer","summary":" This paper explores the adaptation of Transformer-based models for edge\ndevices through the quantisation and hardware acceleration of the ARM Keyword\nTransformer (KWT) model on a RISC-V platform. The model was targeted to run in\n64 kB of RAM in bare-metal C using a custom-developed edge AI library. KWT-1 was\nretrained to be 369 times smaller, with only a 10% loss in accuracy, by\nreducing the output classes from 35 to 2. The retraining and quantisation reduced\nmodel size from 2.42 MB to 1.65 kB. The integration of custom RISC-V\ninstructions that accelerated GELU and SoftMax operations enabled a 5x speedup\nand thus ~5x power reduction in inference, with inference clock cycle counts\ndecreasing from 26 million to 5.5 million clock cycles while incurring a small\narea overhead of approximately 29%. The results demonstrate a viable method for\nporting and accelerating Transformer-based models in low-power IoT devices.\n","authors":["Aness Al-Qawlaq","Ajay Kumar M","Deepu John"],"pdf_url":"https://arxiv.org/pdf/2407.16026v1.pdf","comment":"6 pages, 7 figures, accepted to be published in the IEEE SOCC 2024\n conference"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2407.15780v1","updated":"2024-07-22T16:37:48Z","published":"2024-07-22T16:37:48Z","title":"Explaining Decisions in ML Models: a Parameterized Complexity Analysis","summary":" This paper presents a comprehensive theoretical investigation into the\nparameterized complexity of explanation problems in various machine learning\n(ML) models. Contrary to the prevalent black-box perception, our study focuses\non models with transparent internal mechanisms. We address two principal types\nof explanation problems: abductive and contrastive, both in their local and\nglobal variants. Our analysis encompasses diverse ML models, including Decision\nTrees, Decision Sets, Decision Lists, Ordered Binary Decision Diagrams, Random\nForests, and Boolean Circuits, and ensembles thereof, each offering unique\nexplanatory challenges. This research fills a significant gap in explainable AI\n(XAI) by providing a foundational understanding of the complexities of\ngenerating explanations for these models. This work provides insights vital for\nfurther research in the domain of XAI, contributing to the broader discourse on\nthe necessity of transparency and accountability in AI systems.\n","authors":["Sebastian Ordyniak","Giacomo Paesani","Mateusz Rychlicki","Stefan Szeider"],"pdf_url":"https://arxiv.org/pdf/2407.15780v1.pdf","comment":"A short version of the paper has been accepted at the 21st\n International Conference on Principles of Knowledge Representation and\n Reasoning (KR 2024)"},{"id":"http://arxiv.org/abs/2205.02168v3","updated":"2024-07-22T23:48:11Z","published":"2022-05-04T16:36:23Z","title":"Separations in Proof Complexity and TFNP","summary":" It is well-known that Resolution proofs can be efficiently simulated by\nSherali-Adams (SA) proofs. 
We show, however, that any such simulation needs to\nexploit huge coefficients: Resolution cannot be efficiently simulated by SA\nwhen the coefficients are written in unary. We also show that Reversible\nResolution (a variant of MaxSAT Resolution) cannot be efficiently simulated by\nNullstellensatz (NS).\n These results have consequences for total NP search problems. First, we\ncharacterise the classes PPADS, PPAD, SOPL by unary-SA, unary-NS, and\nReversible Resolution, respectively. Second, we show that, relative to an\noracle, PLS $\not\subseteq$ PPP, SOPL $\not\subseteq$ PPA, and EOPL\n$\not\subseteq$ UEOPL. In particular, together with prior work, this gives a\ncomplete picture of the black-box relationships between all classical TFNP\nclasses introduced in the 1990s.\n","authors":["Mika Göös","Alexandros Hollender","Siddhartha Jain","Gilbert Maystre","William Pires","Robert Robere","Ran Tao"],"pdf_url":"https://arxiv.org/pdf/2205.02168v3.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2304.07687v3","updated":"2024-07-22T00:40:17Z","published":"2023-04-16T03:49:50Z","title":"MLRegTest: A Benchmark for the Machine Learning of Regular Languages","summary":" Synthetic datasets constructed from formal languages allow fine-grained\nexamination of the learning and generalization capabilities of machine learning\nsystems for sequence classification. This article presents a new benchmark for\nmachine learning systems on sequence classification called MLRegTest, which\ncontains training, development, and test sets from 1,800 regular languages.\nDifferent kinds of formal languages represent different kinds of long-distance\ndependencies, and correctly identifying long-distance dependencies in sequences\nis a known generalization challenge for ML systems. MLRegTest\norganizes its languages according to their logical complexity (monadic second\norder, first order, propositional, or monomial expressions) and the kind of\nlogical literals (string, tier-string, subsequence, or combinations thereof).\nThe logical complexity and choice of literal provide a systematic way to\nunderstand different kinds of long-distance dependencies in regular languages,\nand therefore to understand the capacities of different ML systems to learn\nsuch long-distance dependencies. Finally, the performance of different neural\nnetworks (simple RNN, LSTM, GRU, transformer) on MLRegTest is examined. The\nmain conclusion is that performance depends significantly on the kind of test\nset, the class of language, and the neural network architecture.\n","authors":["Sam van der Poel","Dakotah Lambert","Kalina Kostyszyn","Tiantian Gao","Rahul Verma","Derek Andersen","Joanne Chau","Emily Peterson","Cody St. Clair","Paul Fodor","Chihiro Shibata","Jeffrey Heinz"],"pdf_url":"https://arxiv.org/pdf/2304.07687v3.pdf","comment":"43 pages, MLRegTest benchmark available at\n https://doi.org/10.5061/dryad.dncjsxm4h , associated code at\n https://github.com/heinz-jeffrey/subregular-learning"}]},"2024-07-21T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2407.13658v2","updated":"2024-07-21T18:32:21Z","published":"2024-07-18T16:32:48Z","title":"DPDPU: Data Processing with DPUs","summary":" Improving the performance and reducing the cost of cloud data systems is\nincreasingly challenging. 
Data processing units (DPUs) are a promising\nsolution, but utilizing them for data processing requires characterizing the new\nhardware and recognizing their capabilities and constraints. We hence propose\nDPDPU, a platform for holistically exploiting DPUs to optimize data processing\ntasks that are critical to performance and cost. It seeks to fill the semantic\ngap between DPUs and data processing systems and handle DPU heterogeneity with\nthree engines dedicated to compute, networking, and storage. This paper\ndescribes our vision, DPDPU's key components, their associated utilization\nchallenges, as well as the current progress and future plans.\n","authors":["Jiasheng Hu","Philip A. Bernstein","Jialin Li","Qizhen Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.13658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15264v1","updated":"2024-07-21T20:41:39Z","published":"2024-07-21T20:41:39Z","title":"LSM-GNN: Large-scale Storage-based Multi-GPU GNN Training by Optimizing\n Data Transfer Scheme","summary":" Graph Neural Networks (GNNs) are widely used today in recommendation systems,\nfraud detection, and node/link classification tasks. Real-world GNNs continue\nto scale in size and require a large memory footprint for storing graphs and\nembeddings that often exceed the memory capacities of the target GPUs used for\ntraining. To address limited memory capacities, traditional GNN training\napproaches use graph partitioning and sharding techniques to scale up across\nmultiple GPUs within a node and/or scale out across multiple nodes. However,\nthis approach suffers from the high computational costs of graph partitioning\nalgorithms and inefficient communication across GPUs.\n To address these overheads, we propose Large-scale Storage-based Multi-GPU\nGNN framework (LSM-GNN), a storage-based approach to train GNN models that\nutilizes a novel communication layer enabling GPU software caches to function\nas a system-wide shared cache with low overheads. LSM-GNN incorporates a hybrid\neviction policy that intelligently manages cache space by using both static and\ndynamic node information to significantly enhance cache performance.\nFurthermore, we introduce the Preemptive Victim-buffer Prefetcher (PVP), a\nmechanism for prefetching node feature data from a Victim Buffer located in CPU\npinned-memory to further reduce the pressure on the storage devices.\nExperimental results show that despite the lower compute capabilities and\nmemory capacities, LSM-GNN in a single node with two GPUs offers superior\nperformance over a two-node-four-GPU Dist-DGL baseline and provides up to a\n3.75x speedup in end-to-end epoch time while running large-scale GNN training.\n","authors":["Jeongmin Brian Park","Kun Wu","Vikram Sharma Mailthody","Zaid Quresh","Scott Mahlke","Wen-mei Hwu"],"pdf_url":"https://arxiv.org/pdf/2407.15264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15221v1","updated":"2024-07-21T17:18:01Z","published":"2024-07-21T17:18:01Z","title":"Secure Web Objects: Building Blocks for Metaverse Interoperability and\n Decentralization","summary":" This position paper explores how to support the Web's evolution through an\nunderlying data-centric approach that better matches the data-orientedness of\nmodern and emerging applications. We revisit the original vision of the Web as\na hypermedia system that supports document composability and application\ninteroperability via name-based data access. 
We propose the use of secure web\nobjects (SWO), a data-oriented communication approach that can reduce\ncomplexity, centrality, and inefficiency, particularly for collaborative and\nlocal-first applications, such as the Metaverse and other collaborative\napplications. SWO are named, signed, application-defined objects that are\nsecured independently of their containers or communications channels, an\napproach that leverages results from over a decade of data-centric\nnetworking research. This approach does not require intermediation by\naggregators of identity, storage, and other services that are common today. We\npresent a brief design overview, illustrated through prototypes for two editors\nof shared hypermedia documents: one for 3D and one for LaTeX. We also discuss\nour findings and suggest a roadmap for future research.\n","authors":["Tianyuan Yu","Xinyu Ma","Varun Patil","Yekta Kocaogullar","Yulong Zhang","Jeff Burke","Dirk Kutscher","Lixia Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15221v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2212.00250v3","updated":"2024-07-21T11:47:04Z","published":"2022-12-01T03:35:14Z","title":"Split Learning without Local Weight Sharing to Enhance Client-side Data\n Privacy","summary":" Split learning (SL) aims to protect user data privacy by distributing deep\nmodels between client-server and keeping private data locally. In SL training\nwith multiple clients, the local model weights are shared among the clients for\nlocal model update. This paper first reveals data privacy leakage exacerbated\nby local weight sharing among the clients in SL through model inversion\nattacks. Then, to mitigate the data privacy leakage issue, we propose and analyze\nprivacy-enhanced SL (P-SL) (or SL without local weight sharing). We further\npropose parallelized P-SL to expedite the training process by duplicating\nmultiple server-side model instances without compromising accuracy. Finally, we\nexplore P-SL with late participating clients and devise a server-side\ncache-based training method to address the forgetting phenomenon in SL when\nlate clients join. Experimental results demonstrate that P-SL helps reduce up\nto 50% of client-side data leakage, which essentially achieves a better\nprivacy-accuracy trade-off than the current trend of using differential privacy\nmechanisms. Moreover, P-SL and its cache-based version achieve comparable\naccuracy to baseline SL under various data distributions, while costing less\ncomputation and communication. Additionally, caching-based training in P-SL\nmitigates the negative effect of forgetting, stabilizes learning, and\nenables practical and low-complexity training in a dynamic environment with\nlate-arriving clients.\n","authors":["Ngoc Duy Pham","Tran Khoa Phan","Alsharif Abuadbba","Yansong Gao","Doan Nguyen","Naveen Chilamkurti"],"pdf_url":"https://arxiv.org/pdf/2212.00250v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12195v3","updated":"2024-07-21T11:01:00Z","published":"2023-01-28T13:34:36Z","title":"BAFFLE: A Baseline of Backpropagation-Free Federated Learning","summary":" Federated learning (FL) is a general principle for decentralized clients to\ntrain a server model collectively without sharing local data. 
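As background for this principle, here is a minimal FedAvg-style sketch on an invented linear-regression task: each client updates the model on its private data and only parameters travel to the server. This is plain federated averaging, not the BAFFLE scheme that this summary goes on to introduce.

# Background sketch of plain federated averaging (FedAvg): clients compute
# updates locally and share only model parameters, never their data.
# Toy setup (linear regression, full-batch gradient steps) is invented.
import numpy as np

rng = np.random.default_rng(0)
w_true = np.array([1.0, -2.0, 0.5])
clients = []                                   # each holds a private dataset
for _ in range(4):
    X = rng.normal(size=(50, 3))
    y = X @ w_true + 0.01 * rng.normal(size=50)
    clients.append((X, y))

def local_update(w, X, y, lr=0.1, steps=10):
    w = w.copy()
    for _ in range(steps):
        w -= lr * (2 / len(y)) * X.T @ (X @ w - y)  # gradient of local MSE
    return w

w_global = np.zeros(3)                         # server model
for _ in range(20):                            # communication rounds
    updates = [local_update(w_global, X, y) for X, y in clients]
    w_global = np.mean(updates, axis=0)        # server averages client models
print(w_global)  # close to w_true, although no client shared its raw data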
FL is a promising\nframework with practical applications, but its standard training paradigm\nrequires the clients to backpropagate through the model to compute gradients.\nSince these clients are typically edge devices and not fully trusted, executing\nbackpropagation on them incurs computational and storage overhead as well as\nwhite-box vulnerability. In light of this, we develop backpropagation-free\nfederated learning, dubbed BAFFLE, in which backpropagation is replaced by\nmultiple forward processes to estimate gradients. BAFFLE is 1) memory-efficient\nand easily fits within upload bandwidth limits; 2) compatible with inference-only hardware\noptimization and model quantization or pruning; and 3) well-suited to trusted\nexecution environments, because the clients in BAFFLE only execute forward\npropagation and return a set of scalars to the server. Empirically, we use\nBAFFLE to train deep models from scratch or to finetune pretrained models,\nachieving acceptable results. Code is available at\nhttps://github.com/FengHZ/BAFFLE.\n","authors":["Haozhe Feng","Tianyu Pang","Chao Du","Wei Chen","Shuicheng Yan","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2301.12195v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.15037v1","updated":"2024-07-21T02:32:50Z","published":"2024-07-21T02:32:50Z","title":"Lessons Learned on the Path to Guaranteeing the Error Bound in Lossy\n Quantizers","summary":" Rapidly increasing data sizes in scientific computing are the driving force\nbehind the need for lossy compression. The main drawback of lossy data\ncompression is the introduction of error. This paper explains why many\nerror-bounded compressors occasionally violate the error bound and presents the\nsolutions we use in LC, a CPU/GPU-compatible lossy compression framework, to\nguarantee the error bound for all supported types of quantizers. We show that\nour solutions maintain high compression ratios and cause no appreciable change\nin throughput.\n","authors":["Alex Fallin","Martin Burtscher"],"pdf_url":"https://arxiv.org/pdf/2407.15037v1.pdf","comment":"12 pages, 4 figures, 9 tables, presented at the CAV 2024 Workshop on\n Correct Data Compression"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2209.10517v9","updated":"2024-07-21T17:27:23Z","published":"2022-09-21T17:33:48Z","title":"On Quantum Pushdown Systems, Extensions","summary":" In this paper, we first define the quantum analogues of the {\em\nprobabilistic pushdown systems} and {\em Markov chains}, and investigate the\nquestion of whether it is necessary to define a quantum analogue of {\em\nprobabilistic computational tree logic} to describe the probabilistic and\nbranching-time properties of the {\em quantum Markov chain} defined here. We\nstudy its model-checking question and show that the model-checking of {\em\nstateless quantum pushdown systems (qBPA)} against {\em probabilistic\ncomputational tree logic (PCTL)} is generally undecidable.\n We next define the notion of {\em probabilistic $\omega$-pushdown automaton}\nfor the first time and study the model-checking question of {\em stateless\nprobabilistic $\omega$-pushdown system ($\omega$-pBPA)} against $\omega$-PCTL\n(defined by Chatterjee et al. in \cite{CSH08}) and show that the model-checking\nof {\em stateless probabilistic $\omega$-pushdown systems ($\omega$-pBPA)}\nagainst $\omega$-PCTL is generally undecidable. 
Our approach is to construct\nformulas of $\omega$-PCTL encoding the {\em Post Correspondence Problem}\nindirectly.\n","authors":["Deren Lin","Tianrong Lin"],"pdf_url":"https://arxiv.org/pdf/2209.10517v9.pdf","comment":"[v9] not finished; comments are welcome. arXiv admin note:\n substantial text overlap with arXiv:1405.4806"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2407.12677v3","updated":"2024-07-21T11:52:23Z","published":"2024-07-17T15:58:54Z","title":"Tree algebras and bisimulation-invariant MSO on finite graphs","summary":" We establish that the bisimulation-invariant fragment of MSO over finite\ntransition systems is expressively equivalent to modal mu-calculus,\nsettling a question that had remained open for several decades. The\nproof goes by translating the question to an algebraic framework, and showing\nthat the languages of regular trees that are recognized by finitary tree\nalgebras whose sorts zero and one are finite are the regular ones, i.e., the ones\nexpressible in mu-calculus. This corresponds for trees to a weak form of the\nkey translation of Wilke algebras to omega-semigroups over infinite words, and\nhad been a missing piece in the algebraic theory of regular languages of\ninfinite trees for twenty years.\n","authors":["Thomas Colcombet","Amina Doumane","Denis Kuperberg"],"pdf_url":"https://arxiv.org/pdf/2407.12677v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11779v9","updated":"2024-07-21T18:30:20Z","published":"2024-06-17T17:34:25Z","title":"Compact Proofs of Model Performance via Mechanistic Interpretability","summary":" We propose using mechanistic interpretability -- techniques for reverse\nengineering model weights into human-interpretable algorithms -- to derive and\ncompactly prove formal guarantees on model performance. We prototype this\napproach by formally proving lower bounds on the accuracy of 151 small\ntransformers trained on a Max-of-$K$ task. We create 102 different\ncomputer-assisted proof strategies and assess their length and tightness of\nbound on each of our models. Using quantitative metrics, we find that shorter\nproofs seem to require and provide more mechanistic understanding. Moreover, we\nfind that more faithful mechanistic understanding leads to tighter performance\nbounds. We confirm these connections by qualitatively examining a subset of our\nproofs. Finally, we identify compounding structureless noise as a key challenge\nfor using mechanistic interpretability to generate compact proofs on model\nperformance.\n","authors":["Jason Gross","Rajashree Agrawal","Thomas Kwa","Euan Ong","Chun Hei Yip","Alex Gibson","Soufiane Noubir","Lawrence Chan"],"pdf_url":"https://arxiv.org/pdf/2406.11779v9.pdf","comment":"accepted to ICML 2024 Workshop on Mechanistic Interpretability\n (Spotlight)"},{"id":"http://arxiv.org/abs/2209.10517v9","updated":"2024-07-21T17:27:23Z","published":"2022-09-21T17:33:48Z","title":"On Quantum Pushdown Systems, Extensions","summary":" In this paper, we first define the quantum analogues of the {\em\nprobabilistic pushdown systems} and {\em Markov chains}, and investigate the\nquestion of whether it is necessary to define a quantum analogue of {\em\nprobabilistic computational tree logic} to describe the probabilistic and\nbranching-time properties of the {\em quantum Markov chain} defined here. 
We\nstudy its model-checking question and show that the model-checking of {\em\nstateless quantum pushdown systems (qBPA)} against {\em probabilistic\ncomputational tree logic (PCTL)} is generally undecidable.\n We next define the notion of {\em probabilistic $\omega$-pushdown automaton}\nfor the first time and study the model-checking question of {\em stateless\nprobabilistic $\omega$-pushdown system ($\omega$-pBPA)} against $\omega$-PCTL\n(defined by Chatterjee et al. in \cite{CSH08}) and show that the model-checking\nof {\em stateless probabilistic $\omega$-pushdown systems ($\omega$-pBPA)}\nagainst $\omega$-PCTL is generally undecidable. Our approach is to construct\nformulas of $\omega$-PCTL encoding the {\em Post Correspondence Problem}\nindirectly.\n","authors":["Deren Lin","Tianrong Lin"],"pdf_url":"https://arxiv.org/pdf/2209.10517v9.pdf","comment":"[v9] not finished; comments are welcome. arXiv admin note:\n substantial text overlap with arXiv:1405.4806"},{"id":"http://arxiv.org/abs/2407.15192v1","updated":"2024-07-21T15:12:19Z","published":"2024-07-21T15:12:19Z","title":"Error Detection and Constraint Recovery in Hierarchical Multi-Label\n Classification without Prior Knowledge","summary":" Recent advances in Hierarchical Multi-label Classification (HMC),\nparticularly neurosymbolic-based approaches, have demonstrated improved\nconsistency and accuracy by enforcing constraints on a neural model during\ntraining. However, such work assumes the existence of such constraints\na priori. In this paper, we relax this strong assumption and present an\napproach based on Error Detection Rules (EDR) that allow for learning\nexplainable rules about the failure modes of machine learning models. We show\nthat these rules are not only effective in detecting when a machine learning\nclassifier has made an error but also can be leveraged as constraints for HMC,\nthereby allowing the recovery of explainable constraints even if they are not\nprovided. We show that our approach is effective in detecting machine learning\nerrors and recovering constraints, is noise tolerant, and can function as a\nsource of knowledge for neurosymbolic models on multiple datasets, including a\nnewly introduced military vehicle recognition dataset.\n","authors":["Joshua Shay Kricheli","Khoa Vo","Aniruddha Datta","Spencer Ozgur","Paulo Shakarian"],"pdf_url":"https://arxiv.org/pdf/2407.15192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15118v1","updated":"2024-07-21T11:03:16Z","published":"2024-07-21T11:03:16Z","title":"A Cobham theorem for scalar multiplication","summary":" Let $\alpha,\beta \in \mathbb{R}_{>0}$ be such that $\alpha,\beta$ are\nquadratic and $\mathbb{Q}(\alpha)\neq \mathbb{Q}(\beta)$. Then every subset of\n$\mathbb{R}^n$ definable in both $(\mathbb{R},{<},+,\mathbb{Z},x\mapsto \alpha\nx)$ and $(\mathbb{R},{<},+,\mathbb{Z},x\mapsto \beta x)$ is already definable\nin $(\mathbb{R},{<},+,\mathbb{Z})$. 
As a consequence, we generalize\nCobham-Semenov theorems for sets of real numbers to $\beta$-numeration systems,\nwhere $\beta$ is a quadratic irrational.\n","authors":["Philipp Hieronymi","Sven Manthe","Chris Schulz"],"pdf_url":"https://arxiv.org/pdf/2407.15118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07382v2","updated":"2024-07-21T00:10:34Z","published":"2024-04-10T23:01:45Z","title":"Learn from Failure: Fine-Tuning LLMs with Trial-and-Error Data for\n Intuitionistic Propositional Logic Proving","summary":" Recent advances in Automated Theorem Proving have shown the effectiveness of\nleveraging a (large) language model that generates tactics (i.e. proof steps)\nto search through proof states. The current model, while trained solely on\nsuccessful proof paths, faces a discrepancy at the inference stage, as it must\nsample and try various tactics at each proof state until finding success,\nunlike its training, which does not incorporate learning from failed attempts.\nIntuitively, a tactic that leads to a failed search path would indicate that\nsimilar tactics should receive less attention during the following trials. In\nthis paper, we demonstrate the benefit of training models that additionally\nlearn from failed search paths. Facing the lack of such trial-and-error data in\nexisting open-source theorem-proving datasets, we curate a dataset of\nintuitionistic propositional logic theorems and formalize it in Lean, such that\nwe can reliably check the correctness of proofs. We compare our model trained\non relatively short trial-and-error information (TrialMaster) with models\ntrained only on the correct paths and discover that the former solves more\nunseen theorems with fewer trial searches.\n","authors":["Chenyang An","Zhibo Chen","Qihao Ye","Emily First","Letian Peng","Jiayun Zhang","Zihan Wang","Sorin Lerner","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2404.07382v2.pdf","comment":"Accepted as a main conference paper at ACL 2024"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2407.15131v1","updated":"2024-07-21T11:56:54Z","published":"2024-07-21T11:56:54Z","title":"Token-Picker: Accelerating Attention in Text Generation with Minimized\n Memory Transfer via Probability Estimation","summary":" The attention mechanism in text generation is memory-bounded due to its\nsequential characteristics. Therefore, off-chip memory accesses should be\nminimized for faster execution. Although previous methods addressed this by\npruning unimportant tokens, they fall short in selectively removing tokens with\nnear-zero attention probabilities in each instance. Our method estimates the\nprobability before the softmax function, effectively removing low-probability\ntokens and achieving a 12.1x pruning ratio without fine-tuning. Additionally,\nwe present a hardware design supporting seamless on-demand off-chip access. 
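A back-of-the-envelope NumPy sketch of the thresholding math behind pre-softmax pruning follows (shapes, the relevance injection, and the threshold are invented; the paper's estimator is a hardware mechanism, not this code): a score far enough below the running maximum can only receive a near-zero softmax probability, so fetching its key/value data can be skipped.

# Sketch of pruning attention tokens by bounding their softmax probability
# before the softmax itself: exp(s - s_max) <= t  iff  s <= s_max + ln(t),
# so such tokens contribute at most probability t and can be dropped.
import numpy as np

rng = np.random.default_rng(0)
q = rng.normal(size=64)                       # current query
K = rng.normal(size=(512, 64))                # keys of past tokens
K[:8] += 2 * q                                # make a few tokens relevant
V = rng.normal(size=(512, 64))                # values of past tokens

def pruned_attention(q, K, V, threshold=1e-3):
    scores = K @ q / np.sqrt(q.shape[0])
    keep = scores >= scores.max() + np.log(threshold)  # probability bound
    p = np.exp(scores[keep] - scores[keep].max())
    p /= p.sum()
    return p @ V[keep], keep.mean()

out, kept = pruned_attention(q, K, V)
scores = K @ q / np.sqrt(64)                  # dense reference for comparison
full = np.exp(scores - scores.max())
full /= full.sum()
print(f"kept {kept:.1%} of tokens, "
      f"max deviation from dense attention {np.abs(out - full @ V).max():.2e}")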
Our\napproach reduces memory accesses by 2.6x, leading to an average 2.3x speedup\nand 2.4x higher energy efficiency.\n","authors":["Junyoung Park","Myeonggu Kang","Yunki Han","Yanggon Kim","Jaekang Shin","Lee-Sup Kim"],"pdf_url":"https://arxiv.org/pdf/2407.15131v1.pdf","comment":"To appear in the proceedings of 61st Design Automation Conference\n (DAC)"},{"id":"http://arxiv.org/abs/2407.18333v1","updated":"2024-07-21T16:42:45Z","published":"2024-07-21T16:42:45Z","title":"AutoVCoder: A Systematic Framework for Automated Verilog Code Generation\n using LLMs","summary":" Recently, the use of large language models (LLMs) for software code\ngeneration, e.g., C/C++ and Python, has proven a great success. However, LLMs\nstill suffer from low syntactic and functional correctness when it comes to the\ngeneration of register-transfer level (RTL) code, such as Verilog. To address\nthis issue, in this paper, we develop AutoVCoder, a systematic open-source\nframework that significantly improves the LLMs' correctness of generating\nVerilog code and enhances the quality of its output at the same time. Our\nframework integrates three novel techniques, including a high-quality hardware\ndataset generation approach, a two-round LLM fine-tuning method and a\ndomain-specific retrieval-augmented generation (RAG) mechanism. Experimental\nresults demonstrate that AutoVCoder outperforms both industrial and academic\nLLMs in Verilog code generation. Specifically, AutoVCoder shows a 0.5% and 2.2%\nimprovement in functional correctness on the EvalMachine and EvalHuman\nbenchmarks compared with BetterV, and also achieves a 3.4% increase in syntax\ncorrectness and a 3.4% increase in functional correctness on the RTLLM\nbenchmark compared with RTLCoder.\n","authors":["Mingzhe Gao","Jieru Zhao","Zhe Lin","Wenchao Ding","Xiaofeng Hou","Yu Feng","Chao Li","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2407.18333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18271v1","updated":"2024-07-21T11:25:21Z","published":"2024-07-21T11:25:21Z","title":"Large Language Model for Verilog Generation with Golden Code Feedback","summary":" Recent advancements in large language models (LLMs) have catalyzed\nsignificant interest in the automatic generation of Register-Transfer Level\n(RTL) code, particularly Verilog, from natural language instructions. While\ncommercial LLMs like ChatGPT have dominated this domain, open-source\nalternatives have lagged considerably in performance, limiting the flexibility\nand data privacy of this emerging technology. This study introduces a novel\napproach utilizing reinforcement learning with golden code feedback to enhance\nthe performance of pre-trained models. Leveraging open-source data and base\nmodels, we have achieved state-of-the-art (SOTA) results with a substantial\nmargin. Notably, our 6.7B-parameter model, VeriSeek, demonstrates superior\nperformance compared to current best-in-class 13B and 16B models. Furthermore,\nthrough a comprehensive analysis of the limitations in direct fine-tuning and\nthe training dynamics of reinforcement learning, we posit that the development\nof comprehensive supervisory signals, which align with the inherent\nparallel semantics of Verilog code, is critical to effective generation. The\ncode and data associated with this research are publicly available at\n\url{https://github.com/CatIIIIIIII/veriseek}. 
The model weights can be\naccessed at \url{https://huggingface.co/WANGNingroci/VeriSeek}.\n","authors":["Ning Wang","Bingkun Yao","Jie Zhou","Xi Wang","Zhe Jiang","Nan Guan"],"pdf_url":"https://arxiv.org/pdf/2407.18271v1.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2311.07172v2","updated":"2024-07-21T12:41:18Z","published":"2023-11-13T09:06:58Z","title":"VerityMath: Advancing Mathematical Reasoning by Self-Verification\n Through Unit Consistency","summary":" Large Language Models (LLMs), combined with program-based solving techniques,\nare increasingly demonstrating proficiency in mathematical reasoning. For\nexample, closed-source models such as OpenAI GPT-4 and Claude show excellent\nresults in solving math word problems. However, progress in math word\nproblem-solving for open-source LLMs is limited, and the challenges these\nmodels face are not well-studied. In this paper, we study the performance of\nstrong open-source LLMs, including Llama 2 (7B), Code Llama (7B), and Mistral\n(7B) on math word problems using program-based solving techniques.\nSpecifically, we analyze the outputs of these models when applied to math word\nproblems and identify a category of problems that pose a significant challenge,\nparticularly those involving quantities spanning multiple units. To address\nthis issue, we propose a systematic approach by defining the units for each\nquantity and ensuring the consistency of these units during mathematical\noperations. We developed Unit Consistency Programs (UCPs), an annotated dataset\nof math word problems, each paired with programs containing unit specifications\nand unit verification routines. We fine-tuned Llama 2 (7B), Code Llama (7B),\nand Mistral (7B) models with UCPs to produce their VerityMath variants. Our\nfindings indicate that our approach, which incorporates unit consistency,\ncurrently slightly underperforms compared to an approach that does not. To\nunderstand the reasons behind this, we conduct an in-depth error analysis and\nsuggest options for future improvements. Our code and dataset are available at\nhttps://github.com/vernontoh/VerityMath.\n","authors":["Vernon Toh Yan Han","Ratish Puduppully","Nancy F. Chen"],"pdf_url":"https://arxiv.org/pdf/2311.07172v2.pdf","comment":"AI4MATH Workshop @ ICML 2024"},{"id":"http://arxiv.org/abs/2407.15080v1","updated":"2024-07-21T07:30:30Z","published":"2024-07-21T07:30:30Z","title":"SNIP: Speculative Execution and Non-Interference Preservation for\n Compiler Transformations","summary":" We address the problem of preserving non-interference across compiler\ntransformations under speculative semantics. We develop a proof method that\nensures the preservation uniformly across all source programs. The basis of our\nproof method is a new form of simulation relation. It operates over directives\nthat model the attacker's control over the micro-architectural state, and it\naccounts for the fact that the compiler transformation may change the influence\nof the micro-architectural state on the execution (and hence the directives).\nUsing our proof method, we show the correctness of dead code elimination. When\nwe tried to prove register allocation correct, we identified a previously\nunknown weakness that introduces violations of non-interference. We have\nconfirmed the weakness for a mainstream compiler on code from the libsodium\ncryptographic library. 
To reclaim security once more, we develop a novel static\nanalysis that operates on a product of source program and register-allocated\nprogram. Using the analysis, we present an automated fix to existing register\nallocation implementations. We prove the correctness of the fixed register\nallocations with our proof method.\n","authors":["Sören van der Wall","Roland Meyer"],"pdf_url":"https://arxiv.org/pdf/2407.15080v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2407.15193v1","updated":"2024-07-21T15:20:03Z","published":"2024-07-21T15:20:03Z","title":"The Complexity of (P3, H)-Arrowing and Beyond","summary":" Often regarded as the study of how order emerges from randomness, Ramsey\ntheory has played an important role in mathematics and computer science, giving\nrise to applications in numerous domains such as logic, parallel processing,\nand number theory. The core of graph Ramsey theory is arrowing: For fixed\ngraphs $F$ and $H$, the $(F, H)$-Arrowing problem asks whether a given graph,\n$G$, has a red/blue coloring of the edges of $G$ such that there are no red\ncopies of $F$ and no blue copies of $H$. For some cases, the problem has been\nshown to be coNP-complete, or solvable in polynomial time. However, a more\nsystematic approach is needed to categorize the complexity of all cases.\n We focus on $(P_3, H)$-Arrowing as $F = P_3$ is the simplest meaningful case\nfor which the complexity question remains open, and the hardness for this case\nlikely extends to general $(F, H)$-Arrowing for nontrivial $F$. In this\npursuit, we also gain insight into the complexity of a class of matching\nremoval problems, since $(P_3, H)$-Arrowing is equivalent to $H$-free Matching\nRemoval. We show that $(P_3, H)$-Arrowing is coNP-complete for all\n$2$-connected $H$ except when $H = K_3$, in which case the problem is in P. We\nintroduce a new graph invariant to help us carefully combine graphs when\nconstructing the gadgets for our reductions. Moreover, we show how\n$(P_3,H)$-Arrowing hardness results can be extended to other $(F,H)$-Arrowing\nproblems. This allows for more intuitive and palatable hardness proofs instead\nof ad-hoc constructions of SAT gadgets, bringing us closer to categorizing the\ncomplexity of all $(F, H)$-Arrowing problems.\n","authors":["Zohair Raza Hassan"],"pdf_url":"https://arxiv.org/pdf/2407.15193v1.pdf","comment":"To appear in MFCS 2024"}]},"2024-07-23T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2407.15805v2","updated":"2024-07-23T10:27:44Z","published":"2024-07-22T17:18:26Z","title":"A simple and fast C++ thread pool implementation capable of running task\n graphs","summary":" In this paper, the author presents a simple and fast C++ thread pool\nimplementation capable of running task graphs. The implementation is publicly\navailable on GitHub, see https://github.com/dpuyda/scheduling.\n","authors":["Dmytro Puyda"],"pdf_url":"https://arxiv.org/pdf/2407.15805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15870v4","updated":"2024-07-23T15:30:32Z","published":"2023-07-29T02:35:37Z","title":"SemiSFL: Split Federated Learning on Unlabeled and Non-IID Data","summary":" Federated Learning (FL) has emerged to allow multiple clients to\ncollaboratively train machine learning models on their private data at the\nnetwork edge. However, training and deploying large-scale models on\nresource-constrained devices is challenging. 
Fortunately, Split Federated\nLearning (SFL) offers a feasible solution by alleviating the computation and/or\ncommunication burden on clients. However, existing SFL works often assume\nsufficient labeled data on clients, which is usually impractical. Besides, data\nnon-IIDness poses another challenge to ensure efficient model training. To the\nbest of our knowledge, the above two issues have not been simultaneously addressed in\nSFL. Herein, we propose a novel Semi-supervised SFL system, termed SemiSFL,\nwhich incorporates clustering regularization to perform SFL with unlabeled and\nnon-IID client data. Moreover, our theoretical and experimental investigations\ninto model convergence reveal that the inconsistent training processes on\nlabeled and unlabeled data have an influence on the effectiveness of clustering\nregularization. To mitigate the training inconsistency, we develop an algorithm\nfor dynamically adjusting the global updating frequency, so as to improve\ntraining performance. Extensive experiments on benchmark models and datasets\nshow that our system provides a 3.8x speed-up in training time, reduces the\ncommunication cost by about 70.3% while reaching the target accuracy, and\nachieves up to 5.8% improvement in accuracy under non-IID scenarios compared to\nthe state-of-the-art baselines.\n","authors":["Yang Xu","Yunming Liao","Hongli Xu","Zhipeng Sun","Liusheng Huang","Chunming Qiao"],"pdf_url":"https://arxiv.org/pdf/2307.15870v4.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2407.16646v1","updated":"2024-07-23T17:00:09Z","published":"2024-07-23T17:00:09Z","title":"ExaWorks Software Development Kit: A Robust and Scalable Collection of\n Interoperable Workflow Technologies","summary":" Scientific discovery increasingly requires executing heterogeneous scientific\nworkflows on high-performance computing (HPC) platforms. Heterogeneous\nworkflows contain different types of tasks (e.g., simulation, analysis, and\nlearning) that need to be mapped, scheduled, and launched on different\ncomputing resources. That requires a software stack that enables users to code their\nworkflows and automate resource management and workflow execution. Currently,\nthere are many workflow technologies with diverse levels of robustness and\ncapabilities, and users face difficult choices of software that can effectively\nand efficiently support their use cases on HPC machines, especially when\nconsidering the latest exascale platforms. We contributed to addressing this\nissue by developing the ExaWorks Software Development Kit (SDK). The SDK is a\ncurated collection of workflow technologies engineered following current best\npractices and specifically designed to work on HPC platforms. We present our\nexperience with (1) curating those technologies, (2) integrating them to\nprovide users with new capabilities, (3) developing a continuous integration\nplatform to test the SDK on DOE HPC platforms, (4) designing a dashboard to\npublish the results of those tests, and (5) devising an innovative\ndocumentation platform to help users use those technologies. 
Our experience\ndetails the requirements and the best practices needed to curate workflow\ntechnologies, and it also serves as a blueprint for the capabilities and\nservices that DOE will have to offer to support a variety of scientific\nheterogeneous workflows on the newly available exascale HPC platforms.\n","authors":["Matteo Turilli","Mihael Hategan-Marandiuc","Mikhail Titov","Ketan Maheshwari","Aymen Alsaadi","Andre Merzky","Ramon Arambula","Mikhail Zakharchanka","Matt Cowan","Justin M. Wozniak","Andreas Wilke","Ozgur Ozan Kilic","Kyle Chard","Rafael Ferreira da Silva","Shantenu Jha","Daniel Laney"],"pdf_url":"https://arxiv.org/pdf/2407.16646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16560v1","updated":"2024-07-23T15:14:39Z","published":"2024-07-23T15:14:39Z","title":"COALA: A Practical and Vision-Centric Federated Learning Platform","summary":" We present COALA, a vision-centric Federated Learning (FL) platform, and a\nsuite of benchmarks for practical FL scenarios, which we categorize into three\nlevels: task, data, and model. At the task level, COALA extends support from\nsimple classification to 15 computer vision tasks, including object detection,\nsegmentation, pose estimation, and more. It also facilitates federated\nmultiple-task learning, allowing clients to tackle multiple tasks\nsimultaneously. At the data level, COALA goes beyond supervised FL to benchmark\nboth semi-supervised FL and unsupervised FL. It also benchmarks feature\ndistribution shifts other than commonly considered label distribution shifts.\nIn addition to dealing with static data, it supports federated continual\nlearning for continuously changing data in real-world scenarios. At the model\nlevel, COALA benchmarks FL with split models and different models in different\nclients. The COALA platform offers three degrees of customization for these\npractical FL scenarios, including configuration customization, components\ncustomization, and workflow customization. We conduct systematic benchmarking\nexperiments for the practical FL scenarios and highlight potential\nopportunities for further advancements in FL. Codes are open sourced at\nhttps://github.com/SonyResearch/COALA.\n","authors":["Weiming Zhuang","Jian Xu","Chen Chen","Jingtao Li","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2407.16560v1.pdf","comment":"ICML'24"},{"id":"http://arxiv.org/abs/2401.17460v2","updated":"2024-07-23T15:14:08Z","published":"2024-01-30T21:46:09Z","title":"Rendering Wireless Environments Useful for Gradient Estimators: A\n Zero-Order Stochastic Federated Learning Method","summary":" Cross-device federated learning (FL) is a growing machine learning setting\nwhereby multiple edge devices collaborate to train a model without disclosing\ntheir raw data. As a growing number of mobile devices participate in\nFL applications via the wireless environment, the practical implementation of\nthese applications will be hindered by the limited uplink capacity of\ndevices, causing critical bottlenecks. In this work, we propose a novel doubly\ncommunication-efficient zero-order (ZO) method with a one-point gradient\nestimator that replaces communicating long vectors with scalar values and that\nharnesses the nature of the wireless communication channel, overcoming the need\nto know the channel state coefficient. It is the first method that includes the\nwireless channel in the learning algorithm itself instead of wasting resources\nto analyze it and remove its impact. 
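The primitive underlying this family of methods, a one-point zero-order gradient estimate, can be sketched as follows (the toy objective and all constants are invented; the actual ZOFL estimator additionally folds the wireless channel coefficient into the scalar feedback):

# Minimal sketch of a one-point zero-order (ZO) gradient estimate: the
# gradient is inferred from a single scalar evaluation of the objective at
# a randomly perturbed point, so a device only reports one scalar per step.
import numpy as np

rng = np.random.default_rng(1)

def one_point_grad(f, theta, delta=0.2):
    u = rng.normal(size=theta.shape)
    u /= np.linalg.norm(u)              # random direction on the unit sphere
    return (theta.size / delta) * f(theta + delta * u) * u

f = lambda th: np.sum((th - 1.0) ** 2)  # toy objective, minimized at 1
theta = np.zeros(4)                     # true gradient here: [-2, -2, -2, -2]
est = np.mean([one_point_grad(f, theta) for _ in range(200_000)], axis=0)
print(est)  # roughly [-2, -2, -2, -2]: unbiased up to O(delta), but noisy

The single-sample estimate has very high variance (hence the averaging above), which is why the convergence analysis that the summary turns to next is the substantive part of such work.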
We then offer a thorough analysis of the\nproposed zero-order federated learning (ZOFL) framework and prove that our\nmethod converges \textit{almost surely}, which is a novel result in nonconvex\nZO optimization. We further prove a convergence rate of\n$O(\frac{1}{\sqrt[3]{K}})$ in the nonconvex setting. We finally demonstrate the\npotential of our algorithm with experimental results.\n","authors":["Elissa Mhanna","Mohamad Assaad"],"pdf_url":"https://arxiv.org/pdf/2401.17460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16487v1","updated":"2024-07-23T14:01:12Z","published":"2024-07-23T14:01:12Z","title":"DRAM Errors and Cosmic Rays: Space Invaders or Science Fiction?","summary":" It is widely accepted that cosmic rays are a plausible cause of DRAM errors\nin high-performance computing (HPC) systems, and various studies suggest that\nthey could explain some aspects of the observed DRAM error behavior. However,\nthis phenomenon is insufficiently studied in production environments. We\nanalyze the correlations between cosmic rays and DRAM errors on two HPC\nclusters: a production supercomputer with server-class DDR3-1600 and a\nprototype with LPDDR3-1600 and no hardware error correction. Our error logs\ncover 2000 billion MB-hours for the MareNostrum 3 supercomputer and 135 million\nMB-hours for the Mont-Blanc prototype. Our analysis combines quantitative\nanalysis, formal statistical methods and machine learning. We detect no\nindications that cosmic rays have any influence on the DRAM errors. To\nunderstand whether the findings are specific to the systems under study, located\n100 meters above sea level, the analysis should be repeated on other HPC\nclusters, especially ones located at higher altitudes. Also, the analysis can\n(and should) be applied to revisit and extend numerous previous studies that\nuse cosmic rays as a hypothetical explanation for some aspects of the observed\nDRAM error behaviors.\n","authors":["Isaac Boixaderas","Jorge Amaya","Sergi Moré","Javier Bartolome","David Vicente","Osman Unsal","Dimitris Gizopoulos","Paul M. Carpenter","Petar Radojković","Eduard Ayguadé"],"pdf_url":"https://arxiv.org/pdf/2407.16487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16395v1","updated":"2024-07-23T11:35:24Z","published":"2024-07-23T11:35:24Z","title":"Prisec II -- A Comprehensive Model for IoT Security: Cryptographic\n Algorithms and Cloud Integration","summary":" This study addresses the critical issue of ensuring data security and\nefficiency in interconnected devices, especially in IoT environments. The\nobjective is to design and implement a model using cryptographic algorithms to\nenhance data security in 5G networks. 
Challenges arise from the limited\ncomputational capabilities of IoT devices, which require careful analysis and\nselection of cryptographic algorithms to achieve efficient data transmission.\nThis study proposes a model that includes four levels of security, each\nemploying a different strength of encryption to provide better data security.\nFinally, cloud computing optimizes processing efficiency and resource\nutilization to improve data transmission.\n","authors":["Pedro Costa","Valderi Leithardt"],"pdf_url":"https://arxiv.org/pdf/2407.16395v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2407.16377v1","updated":"2024-07-23T11:04:33Z","published":"2024-07-23T11:04:33Z","title":"Reinforcement Learning-based Adaptive Mitigation of Uncorrected DRAM\n Errors in the Field","summary":" Scaling to larger systems, with current levels of reliability, requires\ncost-effective methods to mitigate hardware failures. One of the main causes of\nhardware failure is an uncorrected error in memory, which terminates the\ncurrent job and wastes all computation since the last checkpoint. This paper\npresents the first adaptive method for triggering uncorrected error mitigation.\nIt uses a prediction approach that considers the likelihood of an uncorrected\nerror and its current potential cost. The method is based on reinforcement\nlearning, and the only user-defined parameters are the mitigation cost and\nwhether the job can be restarted from a mitigation point. We evaluate our\nmethod using classical machine learning metrics together with a cost-benefit\nanalysis, which compares the cost of mitigation actions with the benefits from\nmitigating some of the errors. On two years of production logs from the\nMareNostrum supercomputer, our method reduces lost compute time by 54% compared\nwith no mitigation and is just 6% below the optimal Oracle method. All source\ncode is open source.\n","authors":["Isaac Boixaderas","Sergi Moré","Javier Bartolome","David Vicente","Petar Radojković","Paul M. Carpenter","Eduard Ayguadé"],"pdf_url":"https://arxiv.org/pdf/2407.16377v1.pdf","comment":"Published in HPDC'24"},{"id":"http://arxiv.org/abs/2407.16353v1","updated":"2024-07-23T09:54:07Z","published":"2024-07-23T09:54:07Z","title":"Sizey: Memory-Efficient Execution of Scientific Workflow Tasks","summary":" As the amount of available data continues to grow in fields as diverse as\nbioinformatics, physics, and remote sensing, the importance of scientific\nworkflows in the design and implementation of reproducible data analysis\npipelines increases. When developing workflows, resource requirements must be\ndefined for each type of task in the workflow. Typically, task types vary\nwidely in their computational demands because they are simply wrappers for\narbitrary black-box analysis tools. Furthermore, the resource consumption for\nthe same task type can vary considerably as well due to different inputs. Since\nunderestimating memory resources leads to bottlenecks and task failures,\nworkflow developers tend to overestimate memory resources. However,\noverprovisioning of memory wastes resources and limits cluster throughput.\n Addressing this problem, we propose Sizey, a novel online memory prediction\nmethod for workflow tasks. During workflow execution, Sizey simultaneously\ntrains multiple machine learning models and then dynamically selects the best\nmodel for each workflow task. 
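The "train several predictors, keep the best one per task" pattern might look roughly like the following sketch, where held-out mean absolute error stands in for Sizey's RAQ score introduced next (the data, features, and model choices are all invented):

# Hedged sketch of per-task model selection: several regressors predict a
# task's peak memory from an input-size feature, and the one with the best
# held-out error is kept. Synthetic data; not Sizey's actual models or score.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)
X = rng.uniform(1, 100, size=(300, 1))           # invented input-size feature
y = 50 + 3.2 * X[:, 0] + rng.normal(0, 5, 300)   # synthetic peak memory (MB)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

models = [LinearRegression(), KNeighborsRegressor(),
          RandomForestRegressor(random_state=0)]
scores = []
for m in models:
    m.fit(X_tr, y_tr)
    scores.append(mean_absolute_error(y_te, m.predict(X_te)))
best = models[int(np.argmin(scores))]            # lowest error wins this task
print(type(best).__name__, round(min(scores), 2))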
To evaluate the quality of the model, we\nintroduce a novel resource allocation quality (RAQ) score based on memory\nprediction accuracy and efficiency. Sizey's prediction models are retrained and\nre-evaluated online during workflow execution, continuously incorporating\nmetrics from completed tasks.\n Our evaluation with a prototype implementation of Sizey uses metrics from six\nreal-world scientific workflows from the popular nf-core framework and shows a\nmedian reduction in memory waste over time of 24.68% compared to the respective\nbest-performing state-of-the-art baseline.\n","authors":["Jonathan Bader","Fabian Skalski","Fabian Lehmann","Dominik Scheinert","Jonathan Will","Lauritz Thamsen","Odej Kao"],"pdf_url":"https://arxiv.org/pdf/2407.16353v1.pdf","comment":"Paper accepted in 2024 IEEE International Conference on Cluster\n Computing (CLUSTER)"},{"id":"http://arxiv.org/abs/2308.08634v3","updated":"2024-07-23T09:42:29Z","published":"2023-08-16T19:14:52Z","title":"FedPop: Federated Population-based Hyperparameter Tuning","summary":" Federated Learning (FL) is a distributed machine learning (ML) paradigm, in\nwhich multiple clients collaboratively train ML models without centralizing\ntheir local data. Similar to conventional ML pipelines, the client local\noptimization and server aggregation procedure in FL are sensitive to the\nhyperparameter (HP) selection. Despite extensive research on tuning HPs for\ncentralized ML, these methods yield suboptimal results when employed in FL.\nThis is mainly because their \"training-after-tuning\" framework is unsuitable\nfor FL with limited client computation power. While some approaches have been\nproposed for HP-Tuning in FL, they are limited to the HPs for client local\nupdates. In this work, we propose a novel HP-tuning algorithm, called Federated\nPopulation-based Hyperparameter Tuning (FedPop), to address this vital yet\nchallenging problem. FedPop employs population-based evolutionary algorithms to\noptimize the HPs, which accommodates various HP types at both the client and\nserver sides. Compared with prior tuning methods, FedPop employs an online\n\"tuning-while-training\" framework, offering computational efficiency and\nenabling the exploration of a broader HP search space. Our empirical validation\non the common FL benchmarks and complex real-world FL datasets, including\nfull-sized Non-IID ImageNet-1K, demonstrates the effectiveness of the proposed\nmethod, which substantially outperforms the concurrent state-of-the-art\nHP-tuning methods in FL.\n","authors":["Haokun Chen","Denis Krompass","Jindong Gu","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2308.08634v3.pdf","comment":"Code: https://github.com/HaokunChen245/FedPop"},{"id":"http://arxiv.org/abs/2407.16300v1","updated":"2024-07-23T08:55:10Z","published":"2024-07-23T08:55:10Z","title":"A Programming Model for Disaggregated Memory over CXL","summary":" CXL (Compute Express Link) is an emerging open industry-standard interconnect\nbetween processing and memory devices that is expected to revolutionize the way\nsystems are designed in the near future. It enables cache-coherent shared\nmemory pools in a disaggregated fashion at unprecedented scales, allowing\nalgorithms to interact with a variety of storage devices using simple loads and\nstores in a cacheline granularity. Alongside with unleashing unique\nopportunities for a wide range of applications, CXL introduces new challenges\nof data management and crash consistency. 
Alas, CXL lacks an adequate\nprogramming model, which makes reasoning about the correctness and expected\nbehaviors of algorithms and systems on top of it nearly impossible.\n In this work, we present CXL0, the first programming model for concurrent\nprograms running on top of CXL. We propose a high-level abstraction for CXL\nmemory accesses and formally define operational semantics on top of that\nabstraction. We provide a set of general transformations that adapt concurrent\nalgorithms to the new disruptive technology. Using these transformations, every\nlinearizable algorithm can be easily transformed into its provably correct\nversion in the face of a full-system or sub-system crash. We believe that this\nwork will serve as a stepping stone for systems design and modelling on top\nof CXL, and support the development of future models as software and hardware\nevolve.\n","authors":["Gal Assa","Michal Friedman","Ori Lahav"],"pdf_url":"https://arxiv.org/pdf/2407.16300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05723v3","updated":"2024-07-23T00:06:34Z","published":"2023-11-09T20:14:48Z","title":"Active Admission Control in a P2P Distributed Environment for Capacity\n Efficient Livestreaming in Mobile Wireless Networks","summary":" In this study, the Active Control in an Intelligent and Distributed\nEnvironment (ACIDE) media distribution model solution and algorithms are\nproposed for livestreaming in capacity efficient mobile wireless networks. The\nelements of the ACIDE model are a base station and a cluster formed by a number\nof peers able to establish peer-to-peer communications. The cluster peers are\nselected from a group of users interested in livestreaming the same media. The\nACIDE model solution minimizes the bandwidth allocated to a cluster of n peers\nsuch that uninterrupted media playback for all peers is guaranteed. The\nlivestream media is sent to the peers in packages and every media package is\ndivided into n blocks. The blocks are distributed to the n peers of a cluster\nin two phases, such that the base station bandwidth is utilized during the first\nphase only. The allocated bandwidth, the amount of bandwidth the base station\nhas to allocate to a cluster, is minimized and its lower bound is equal to the\nbandwidth required for multicasting. In this study, the ACIDE model is used to\naddress the problem of how to find the maximum number of peers n, chosen from a\ngroup of N users, that can be admitted to a cluster given the allocated\nbandwidth, i.e., the amount of bandwidth that a base station allocates to a cluster\nin advance, prior to admitting users. When users become peers of an ACIDE\ncluster, the network capacity, the total number of users who are able to access\nlive media, increases, meaning that network resources are used more efficiently.\nThe problem of finding the maximum number of peers n is addressed as an\noptimization problem, with the objective of having the entire given allocated\nbandwidth used by the peers admitted to the cluster. 
This problem is\nNP-complete and a non-optimal solution is proposed for peer selection such\nthat all admitted peers play media continuously.\n","authors":["Andrei Negulescu","Weijia Shang"],"pdf_url":"https://arxiv.org/pdf/2311.05723v3.pdf","comment":"8 pages, 6 figures, 3 tables; Accepted for publication in:\n Proceedings of the 2023 International Conference on Computational Science and\n Computational Intelligence (CSCI'23: December 13-15, 2023, Las Vegas, Nevada,\n USA); Publisher: IEEE Computer Society (CPS)"},{"id":"http://arxiv.org/abs/2407.16836v1","updated":"2024-07-23T21:01:12Z","published":"2024-07-23T21:01:12Z","title":"Inference Load-Aware Orchestration for Hierarchical Federated Learning","summary":" Hierarchical federated learning (HFL) designs introduce intermediate\naggregator nodes between clients and the global federated learning server in\norder to reduce communication costs and distribute server load. One side effect\nis that machine learning model replication at scale comes \"for free\" as part of\nthe HFL process: model replicas are hosted at the client end, intermediate\nnodes, and the global server level and are readily available for serving\ninference requests. This creates opportunities for efficient model serving but\nsimultaneously couples the training and serving processes and calls for their\njoint orchestration. This is particularly important for continual learning,\nwhere serving a model while (re)training it periodically, upon specific\ntriggers, or continuously, takes place over shared infrastructure spanning the\ncomputing continuum. Consequently, training and inference workloads can\ninterfere with detrimental effects on performance. To address this issue, we\npropose an inference load-aware HFL orchestration scheme, which makes informed\ndecisions on HFL configuration, considering knowledge about inference workloads\nand the respective processing capacity. Applying our scheme to a continual\nlearning use case in the transportation domain, we demonstrate that by\noptimizing aggregator node placement and device-aggregator association,\nsignificant inference latency savings can be achieved while communication costs\nare drastically reduced compared to flat centralized federated learning.\n","authors":["Anna Lackinger","Pantelis A. Frangoudis","Ivan Čilić","Alireza Furutanpey","Ilir Murturi","Ivana Podnar Žarko","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2407.16836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01372v4","updated":"2024-07-23T20:40:16Z","published":"2022-12-02T18:54:30Z","title":"Refined Bitcoin Security-Latency Under Network Delay","summary":" We study security-latency bounds for Nakamoto consensus, i.e., how secure a\nblock is after it becomes $k$-deep in the chain. We improve the\nstate-of-the-art bounds by analyzing the race between adversarial and honest\nchains in three different phases. We find the probability distribution of the\ngrowth of the adversarial chains under models similar to those in [Guo, Ren;\nAFT 2022] when a target block becomes $k$-deep in the chain. We analyze certain\nproperties of this race to model each phase with random walks that provide\ntighter bounds than the existing results. 
Combining all three phases provides\nnovel upper and lower bounds for blockchains with small $\\lambda\\Delta$.\n","authors":["Mustafa Doger","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2212.01372v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03709v3","updated":"2024-07-23T18:38:01Z","published":"2024-03-06T13:48:44Z","title":"Portable, heterogeneous ensemble workflows at scale using libEnsemble","summary":" libEnsemble is a Python-based toolkit for running dynamic ensembles,\ndeveloped as part of the DOE Exascale Computing Project. The toolkit utilizes a\nunique generator--simulator--allocator paradigm, where generators produce input\nfor simulators, simulators evaluate those inputs, and allocators decide whether\nand when a simulator or generator should be called. The generator steers the\nensemble based on simulation results. Generators may, for example, apply\nmethods for numerical optimization, machine learning, or statistical\ncalibration. libEnsemble communicates between a manager and workers. We\noverview the unique characteristics of libEnsemble as well as current and\npotential interoperability with other packages in the workflow ecosystem. We\nhighlight libEnsemble's dynamic resource features: libEnsemble can detect\nsystem resources, such as available nodes, cores, and GPUs, and assign these in\na portable way. These features allow users to specify the number of processors\nand GPUs required for each simulation, and resources will be automatically\nassigned on a wide range of systems, including Frontier, Aurora, and\nPerlmutter. Such ensembles can include multiple simulation types, some using\nGPUs and others using only CPUs, sharing nodes for maximum efficiency. We also\ndescribe the benefits of libEnsemble's generator--simulator coupling, which\neasily exposes to the user the ability to cancel, and portably kill, running\nsimulations based on models that are updated with intermediate simulation\noutput. We demonstrate libEnsemble's capabilities, scalability, and scientific\nimpact via a Gaussian process surrogate training problem for the longitudinal\ndensity profile at the exit of a plasma accelerator stage. The study uses gpCAM\nfor the surrogate model and employs either Wake-T or WarpX simulations,\nhighlighting efficient use of resources that can easily extend to exascale.\n","authors":["Stephen Hudson","Jeffrey Larson","John-Luke Navarro","Stefan M. Wild"],"pdf_url":"https://arxiv.org/pdf/2403.03709v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16728v1","updated":"2024-07-23T14:41:32Z","published":"2024-07-23T14:41:32Z","title":"Distributed Difference of Convex Optimization","summary":" In this article, we focus on solving a class of distributed optimization\nproblems involving $n$ agents with the local objective function at every agent\n$i$ given by the difference of two convex functions $f_i$ and $g_i$\n(difference-of-convex (DC) form), where $f_i$ and $g_i$ are potentially\nnonsmooth. The agents communicate via a directed graph containing $n$ nodes. We\ncreate smooth approximations of the functions $f_i$ and $g_i$ and develop a\ndistributed algorithm utilizing the gradients of the smooth surrogates and a\nfinite-time approximate consensus protocol. We term this algorithm\nDDC-Consensus. The developed DDC-Consensus algorithm allows for non-symmetric\ndirected graph topologies and can be synthesized distributively. 
We establish\nthat the DDC-Consensus algorithm converges to a stationary point of the\nnonconvex distributed optimization problem. The performance of the\nDDC-Consensus algorithm is evaluated via a simulation study to solve a\nnonconvex DC-regularized distributed least squares problem. The numerical\nresults corroborate the efficacy of the proposed algorithm.\n","authors":["Vivek Khatana","Murti V. Salapaka"],"pdf_url":"https://arxiv.org/pdf/2407.16728v1.pdf","comment":"9 pages, 7 figures"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2407.16237v1","updated":"2024-07-23T07:22:25Z","published":"2024-07-23T07:22:25Z","title":"OriGen:Enhancing RTL Code Generation with Code-to-Code Augmentation and\n Self-Reflection","summary":" Recent studies have illuminated that Large Language Models (LLMs) exhibit\nsubstantial potential in the realm of RTL (Register Transfer Level) code\ngeneration, with notable advancements evidenced by commercial models such as\nGPT-4 and Claude3-Opus. Despite their proficiency, these commercial LLMs often\nraise concerns regarding privacy and security. Conversely, open-source LLMs,\nwhich offer solutions to these concerns, have inferior performance in RTL code\ngeneration tasks compared to commercial models due to the lack of high-quality\nopen-source RTL datasets. To address this issue, we introduce OriGen, a fully\nopen-source framework featuring self-reflection capabilities and a dataset\naugmentation methodology for generating high-quality, large-scale RTL code. We\npropose a novel code-to-code augmentation methodology that leverages knowledge\ndistillation to enhance the quality of the open-source RTL code datasets.\nAdditionally, OriGen is capable of correcting syntactic errors by leveraging a\nself-reflection process based on feedback from the compiler. The\nself-reflection ability of the model is facilitated by a carefully constructed\ndataset, which comprises a comprehensive collection of samples. Experimental\nresults demonstrate that OriGen remarkably outperforms other open-source\nalternatives in RTL code generation, surpassing the previous best-performing\nLLM by 9.8% on the VerilogEval-Human benchmark. Furthermore, OriGen exhibits\nsuperior capabilities in self-reflection and error rectification, surpassing\nGPT-4 by 18.1% on the benchmark designed to evaluate the capability of\nself-reflection.\n","authors":["Fan Cui","Chenyang Yin","Kexing Zhou","Youwei Xiao","Guangyu Sun","Qiang Xu","Qipeng Guo","Demin Song","Dahua Lin","Xingcheng Zhang","Yun Liang"],"pdf_url":"https://arxiv.org/pdf/2407.16237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18276v1","updated":"2024-07-23T21:18:31Z","published":"2024-07-23T21:18:31Z","title":"Rome was Not Built in a Single Step: Hierarchical Prompting for\n LLM-based Chip Design","summary":" Large Language Models (LLMs) are effective in computer hardware synthesis via\nhardware description language (HDL) generation. However, LLM-assisted\napproaches for HDL generation struggle when handling complex tasks. We\nintroduce a suite of hierarchical prompting techniques which facilitate\nefficient stepwise design methods, and develop a generalizable automation\npipeline for the process. To evaluate these techniques, we present a benchmark\nset of hardware designs which have solutions with or without architectural\nhierarchy. Using these benchmarks, we compare various open-source and\nproprietary LLMs, including our own fine-tuned Code Llama-Verilog model. 
Our\nhierarchical methods automatically produce successful designs for complex\nhardware modules that standard flat prompting methods cannot achieve, allowing\nsmaller open-source LLMs to compete with large proprietary models. Hierarchical\nprompting reduces HDL generation time and yields savings on LLM costs. Our\nexperiments detail which LLMs are capable of which applications, and how to\napply hierarchical methods in various modes. We explore case studies of\ngenerating complex cores using automatic scripted hierarchical prompts,\nincluding the first-ever LLM-designed processor with no human feedback.\n","authors":["Andre Nakkab","Sai Qian Zhang","Ramesh Karri","Siddharth Garg"],"pdf_url":"https://arxiv.org/pdf/2407.18276v1.pdf","comment":"Accepted at MLCAD '24. 10 pages, 7 figures, 5 tables"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2407.16504v1","updated":"2024-07-23T14:21:27Z","published":"2024-07-23T14:21:27Z","title":"Language-Based Security for Low-Level MPC","summary":" Secure Multi-Party Computation (MPC) is an important enabling technology for\ndata privacy in modern distributed applications. Currently, proof methods for\nlow-level MPC protocols are primarily manual and thus tedious and error-prone,\nand are also non-standardized and unfamiliar to most PL theorists. As a step\ntowards better language support and language-based enforcement, we develop a\nnew staged PL for defining a variety of low-level probabilistic MPC protocols.\nWe also formulate a collection of confidentiality and integrity hyperproperties\nfor our language model that are familiar from information flow, including\nconditional noninterference, gradual release, and robust declassification. We\ndemonstrate their relation to standard MPC threat models of passive and\nmalicious security, and how they can be leveraged in security verification of\nprotocols. To prove these properties we develop automated tactics in\n$\\mathbb{F}_2$ that can be integrated with separation logic-style reasoning.\n","authors":["Christian Skalka","Joseph P. Near"],"pdf_url":"https://arxiv.org/pdf/2407.16504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11919v2","updated":"2024-07-23T13:18:36Z","published":"2024-03-18T16:19:15Z","title":"A Coq Mechanization of JavaScript Regular Expression Semantics","summary":" We present an executable, proven-safe, faithful, and future-proof Coq\nmechanization of JavaScript regular expression (regex) matching, as specified\nby the last published edition of ECMA-262 section 22.2. This is, to our\nknowledge, the first time that an industrial-strength regex language has been\nfaithfully mechanized in an interactive theorem prover. 
We highlight\ninteresting challenges that arose in the process (including issues of encoding,\ncorner cases, and executability), and we document the steps that we took to\nensure that the result is straightforwardly auditable and that our\nunderstanding of the spec aligns with existing implementations.\n We demonstrate the usability and versatility of the mechanization through a\nbroad collection of analyses, case studies, and experiments: we prove that\nJavaScript regex matching always terminates and is safe (no assertion\nfailures); we identify subtle corner cases that led to mistakes in previous\npublications; we verify an optimization extracted from a state-of-the-art regex\nengine; we show that some classic properties described in automata textbooks\nand used in derivatives-based matchers do not hold in JavaScript regexes; and\nwe demonstrate that the cost of updating the mechanization to account for\nchanges in the original specification is reasonably low.\n Our mechanization can be extracted to OCaml and linked with Unicode libraries\nto produce an executable engine that passes the relevant parts of the official\nTest262 conformance test suite.\n","authors":["Noé De Santo","Aurèle Barrière","Clément Pit-Claudel"],"pdf_url":"https://arxiv.org/pdf/2403.11919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17620v2","updated":"2024-07-23T12:56:19Z","published":"2023-11-29T13:28:30Z","title":"Linear Matching of JavaScript Regular Expressions","summary":" Modern regex languages have strayed far from well-understood traditional\nregular expressions: they include features that fundamentally transform the\nmatching problem. In exchange for these features, modern regex engines at times\nsuffer from exponential complexity blowups, a frequent source of\ndenial-of-service vulnerabilities in JavaScript applications. Worse, regex\nsemantics differ across languages, and the impact of these divergences on\nalgorithmic design and worst-case matching complexity has seldom been\ninvestigated.\n This paper provides a novel perspective on JavaScript's regex semantics by\nidentifying a larger-than-previously-understood subset of the language that can\nbe matched with linear time guarantees. In the process, we discover several\ncases where state-of-the-art algorithms were either wrong (semantically\nincorrect), inefficient (suffering from superlinear complexity) or excessively\nrestrictive (assuming certain features could not be matched linearly). We\nintroduce novel algorithms to restore correctness and linear complexity. We\nfurther advance the state-of-the-art in linear regex matching by presenting the\nfirst nonbacktracking algorithms for matching lookarounds in linear time: one\nsupporting captureless lookbehinds in any regex language, and another\nleveraging a JavaScript property to support unrestricted lookaheads and\nlookbehinds. Finally, we describe new time and space complexity tradeoffs for\nregex engines. 
All of our algorithms are practical: we validated them in a\nprototype implementation, and some have also been merged in the V8 JavaScript\nimplementation used in Chrome and Node.js.\n","authors":["Aurèle Barrière","Clément Pit-Claudel"],"pdf_url":"https://arxiv.org/pdf/2311.17620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14347v3","updated":"2024-07-23T02:50:29Z","published":"2023-11-24T08:48:00Z","title":"Typed compositional quantum computation with lenses","summary":" We propose a type-theoretic framework for describing and proving properties\nof quantum computations, in particular those presented as quantum circuits. Our\nproposal is based on an observation that, in the polymorphic type system of\nCoq, currying on quantum states allows us to apply quantum gates directly\ninside a complex circuit. By introducing a discrete notion of lens to control\nthis currying, we are further able to separate the combinatorics of the circuit\nstructure from the computational content of gates. We apply our development to\ndefine quantum circuits recursively from the bottom up, and prove their\ncorrectness compositionally.\n","authors":["Jacques Garrigue","Takafumi Saikawa"],"pdf_url":"https://arxiv.org/pdf/2311.14347v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04604v3","updated":"2024-07-23T18:26:32Z","published":"2024-06-07T03:27:51Z","title":"Learning Task Decomposition to Assist Humans in Competitive Programming","summary":" When using language models (LMs) to solve complex problems, humans might\nstruggle to understand the LM-generated solutions and repair the flawed ones.\nTo assist humans in repairing them, we propose to automatically decompose\ncomplex solutions into multiple simpler pieces that correspond to specific\nsubtasks. We introduce a novel objective for learning task decomposition,\ntermed assistive value (AssistV), which measures the feasibility and speed for\nhumans to repair the decomposed solution. We collect a dataset of human repair\nexperiences on different decomposed solutions. Utilizing the collected data as\nin-context examples, we then learn to critique, refine, and rank decomposed\nsolutions to improve AssistV. We validate our method under competitive\nprogramming problems: under 177 hours of human study, our method enables\nnon-experts to solve 33.3\\% more problems, speeds them up by 3.3x, and empowers\nthem to match unassisted experts.\n","authors":["Jiaxin Wen","Ruiqi Zhong","Pei Ke","Zhihong Shao","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2406.04604v3.pdf","comment":"ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2407.16847v1","updated":"2024-07-23T21:18:07Z","published":"2024-07-23T21:18:07Z","title":"SPLAT: A framework for optimised GPU code-generation for SParse reguLar\n ATtention","summary":" Multi-head-self-attention (MHSA) mechanisms achieve state-of-the-art (SOTA)\nperformance across natural language processing and vision tasks. However, their\nquadratic dependence on sequence lengths has bottlenecked inference speeds. To\ncircumvent this bottleneck, researchers have proposed various sparse-MHSA\nmodels, where a subset of full attention is computed. Despite their promise,\ncurrent sparse libraries and compilers do not support high-performance\nimplementations for diverse sparse-MHSA patterns due to the underlying sparse\nformats they operate on. 
These formats, which are typically designed for\nhigh-performance & scientific computing applications, are either curated for\nextreme amounts of random sparsity (<1% non-zero values), or specific sparsity\npatterns. However, the sparsity patterns in sparse-MHSA are moderately sparse\n(10-50% non-zero values) and varied, resulting in existing sparse-formats\ntrading off generality for performance.\n We bridge this gap, achieving both generality and performance, by proposing a\nnovel sparse format: affine-compressed-sparse-row (ACSR) and supporting\ncode-generation scheme, SPLAT, that generates high-performance implementations\nfor diverse sparse-MHSA patterns on GPUs. Core to our proposed format and code\ngeneration algorithm is the observation that common sparse-MHSA patterns have\nuniquely regular geometric properties. These properties, which can be analyzed\njust-in-time, expose novel optimizations and tiling strategies that SPLAT\nexploits to generate high-performance implementations for diverse patterns. To\ndemonstrate SPLAT's efficacy, we use it to generate code for various\nsparse-MHSA models, achieving geomean speedups of 2.05x and 4.05x over\nhand-written kernels written in triton and TVM respectively on A100 GPUs.\nMoreover, its interfaces are intuitive and easy to use with existing\nimplementations of MHSA in JAX.\n","authors":["Ahan Gupta","Yueming Yuan","Devansh Jain","Yuhao Ge","David Aponte","Yanqi Zhou","Charith Mendis"],"pdf_url":"https://arxiv.org/pdf/2407.16847v1.pdf","comment":"31 pages, 16 figures"},{"id":"http://arxiv.org/abs/2407.16801v1","updated":"2024-07-23T19:02:55Z","published":"2024-07-23T19:02:55Z","title":"Qudit Quantum Programming with Projective Cliffords","summary":" This paper introduces a novel abstraction for programming quantum operations,\nspecifically projective Cliffords, as functions over the qudit Pauli group. We\ndefine a categorical semantics for projective Cliffords based on Pauli\nencodings in terms of $\\mathbb{Z}_d$-linear maps. We then introduce a type\nsystem and lambda calculus for both $\\mathbb{Z}_d$-linear maps and projective\nCliffords, and prove that these type systems have a sound denotational\nsemantics in terms of the relevant categories. Finally, we explore what it\nmeans to program with projective Cliffords through a number of examples and\nprogramming constructions.\n","authors":["Jennifer Paykin","Sam Winnick"],"pdf_url":"https://arxiv.org/pdf/2407.16801v1.pdf","comment":"42 pages"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2311.05723v3","updated":"2024-07-23T00:06:34Z","published":"2023-11-09T20:14:48Z","title":"Active Admission Control in a P2P Distributed Environment for Capacity\n Efficient Livestreaming in Mobile Wireless Networks","summary":" In this study, the Active Control in an Intelligent and Distributed\nEnvironment (ACIDE) media distribution model solution and algorithms are\nproposed for livestreaming in capacity efficient mobile wireless networks. The\nelements of the ACIDE model are a base station and a cluster formed by a number\nof peers able to establish peer to peer communications. The cluster peers are\nselected from a group of users interested in livestreaming the same media. The\nACIDE model solution minimizes the bandwidth allocated to a cluster of n peers\nsuch that an uninterrupted media play for all peers is guaranteed. The\nlivestream media is sent to the peers in packages and every media package is\ndivided into n blocks. 
The blocks are distributed to the n peers of a cluster\nin two phases, such that the base station bandwidth is utilized during the first\nphase only. The allocated bandwidth, the amount of bandwidth the base station\nhas to allocate to a cluster, is minimized and its lower bound is equal to the\nbandwidth required for multicasting. In this study, the ACIDE model is used to\naddress the problem of how to find the maximum number of peers n, chosen from a\ngroup of N users, that can be admitted to a cluster knowing the given allocated\nbandwidth, the amount of bandwidth that a base station allocates to a cluster\nin advance, prior to admitting users. When users become peers of an ACIDE\ncluster, the network capacity, the total number of users who are able to access\nlive media, increases, meaning that network resources are used more efficiently.\nThe problem of finding the maximum number of peers n is addressed as an\noptimization problem, with the objective of having the entire given allocated\nbandwidth used by the peers admitted to the cluster. This problem is\nNP-complete and a non-optimal solution is proposed for peer selection such\nthat all admitted peers play media continuously.\n","authors":["Andrei Negulescu","Weijia Shang"],"pdf_url":"https://arxiv.org/pdf/2311.05723v3.pdf","comment":"8 pages, 6 figures, 3 tables; Accepted for publication in:\n Proceedings of the 2023 International Conference on Computational Science and\n Computational Intelligence (CSCI'23: December 13-15, 2023, Las Vegas, Nevada,\n USA); Publisher: IEEE Computer Society (CPS)"},{"id":"http://arxiv.org/abs/1902.08318v7","updated":"2024-07-23T21:56:05Z","published":"2019-02-22T00:24:01Z","title":"Parsing Gigabytes of JSON per Second","summary":" JavaScript Object Notation or JSON is a ubiquitous data exchange format on\nthe Web. Ingesting JSON documents can become a performance bottleneck due to\nthe sheer volume of data. We are thus motivated to make JSON parsing as fast as\npossible.\n Despite the maturity of the problem of JSON parsing, we show that substantial\nspeedups are possible. We present the first standard-compliant JSON parser to\nprocess gigabytes of data per second on a single core, using commodity\nprocessors. We can use a quarter or fewer instructions than a state-of-the-art\nreference parser like RapidJSON. Unlike other validating parsers, our software\n(simdjson) makes extensive use of Single Instruction, Multiple Data (SIMD)\ninstructions. To ensure reproducibility, simdjson is freely available as\nopen-source software under a liberal license.\n","authors":["Geoff Langdale","Daniel Lemire"],"pdf_url":"https://arxiv.org/pdf/1902.08318v7.pdf","comment":"software: https://github.com/lemire/simdjson"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2407.16661v1","updated":"2024-07-23T17:25:02Z","published":"2024-07-23T17:25:02Z","title":"Regenerative Ulam-von Neumann Algorithm: An Innovative Markov chain\n Monte Carlo Method for Matrix Inversion","summary":" This paper presents an extension of the classical Ulam-von Neumann Markov\nchain Monte-Carlo algorithm for the computation of the matrix inverse. The\nalgorithm presented in this paper, termed the \\emph{regenerative Ulam-von\nNeumann algorithm}, utilizes the regenerative structure of classical,\nnon-truncated Neumann series defined by a non-singular matrix and produces an\nunbiased estimator of the matrix inverse. 
Furthermore, the accuracy of the\nproposed algorithm depends on a single parameter that controls the total number\nof Markov transitions simulated, thus avoiding the challenge of balancing\nbetween the total number of Markov chain replications and its corresponding\nlength as in the classical Ulam-von Neumann algorithm. To efficiently utilize\nthe Markov chain transition samples in the calculation of the regenerative\nquantities, the proposed algorithm automatically quantifies the contribution of\neach Markov transition to all regenerative quantities by a carefully designed\nupdating scheme that utilizes three separate matrices containing the current\nweights, total weights, and regenerative cycle count, respectively. A\nprobabilistic analysis of the performance of the algorithm, including the\nvariance of the estimator, is provided. Finally, numerical experiments verify\nthe qualitative effectiveness of the proposed scheme.\n","authors":["Soumyadip Ghosh","Lior Horesh","Vassilis Kalantzis","Yingdong Lu","Tomasz Nowicki"],"pdf_url":"https://arxiv.org/pdf/2407.16661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16597v1","updated":"2024-07-23T15:59:52Z","published":"2024-07-23T15:59:52Z","title":"Inference of rankings planted in random tournaments","summary":" We consider the problem of inferring an unknown ranking of $n$ items from a\nrandom tournament on $n$ vertices whose edge directions are correlated with the\nranking. We establish, in terms of the strength of these correlations, the\ncomputational and statistical thresholds for detection (deciding whether an\nobserved tournament is purely random or drawn correlated with a hidden ranking)\nand recovery (estimating the hidden ranking with small error in Spearman's\nfootrule or Kendall's tau metric on permutations). Notably, we find that this\nproblem provides a new instance of a detection-recovery gap: solving the\ndetection problem requires much weaker correlations than solving the recovery\nproblem. In establishing these thresholds, we also identify simple algorithms\nfor detection (thresholding a degree 2 polynomial) and recovery (outputting a\nranking by the number of \"wins\" of a tournament vertex, i.e., the out-degree)\nthat achieve optimal performance up to constants in the correlation strength.\nFor detection, we find that the above low-degree polynomial algorithm is\nsuperior to a natural spectral algorithm. We also find that, whenever it is\npossible to achieve strong recovery (i.e., to estimate with vanishing error in\nthe above metrics) of the hidden ranking, then the above \"Ranking By Wins\"\nalgorithm not only does so, but also outputs a close approximation of the\nmaximum likelihood estimator, a task that is NP-hard in the worst case.\n","authors":["Dmitriy Kunisky","Daniel A. Spielman","Xifan Yu"],"pdf_url":"https://arxiv.org/pdf/2407.16597v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2312.04360v2","updated":"2024-07-23T15:51:20Z","published":"2023-12-07T15:28:40Z","title":"The Computational Advantage of MIP* Vanishes in the Presence of Noise","summary":" Quantum multiprover interactive proof systems with entanglement MIP* are much\nmore powerful than their classical counterpart MIP (Babai et al. '91, Ji et al.\n'20): while MIP = NEXP, the quantum class MIP* is equal to RE, a class\nincluding the halting problem. This is because the provers in MIP* can share\nunbounded quantum entanglement. 
However, recent works of Qin and Yao '21 and\n'23 have shown that this advantage is significantly reduced if the provers'\nshared state contains noise. This paper attempts to exactly characterize the\neffect of noise on the computational power of quantum multiprover interactive\nproof systems. We investigate the quantum two-prover one-round interactive\nsystem MIP*[poly, O(1)], where the verifier sends polynomially many bits to the\nprovers and the provers send back constantly many bits. We show noise\ncompletely destroys the computational advantage given by shared entanglement in\nthis model. Specifically, we show that if the provers are allowed to share\narbitrarily many noisy EPR states, where each EPR state is affected by an\narbitrarily small constant amount of noise, the resulting complexity class is\nequivalent to NEXP = MIP. This improves significantly on the previous\nbest-known bound of NEEEXP (nondeterministic triply exponential time) by Qin\nand Yao '21. We also show that this collapse in power is due to the noise,\nrather than the O(1) answer size, by showing that allowing for noiseless EPR\nstates gives the class the full power of RE = MIP*[poly, poly]. Along the way,\nwe develop two technical tools of independent interest. First, we give a new,\ndeterministic tester for the positivity of an exponentially large matrix,\nprovided it has a low-degree Fourier decomposition in terms of Pauli matrices.\nSecond, we develop a new invariance principle for smooth matrix functions\nhaving bounded third-order Fr\\'echet derivatives or which are Lipschitz\ncontinuous.\n","authors":["Yangjing Dong","Honghao Fu","Anand Natarajan","Minglong Qin","Haochen Xu","Penghui Yao"],"pdf_url":"https://arxiv.org/pdf/2312.04360v2.pdf","comment":"V2, updated results. Comments are welcome!"},{"id":"http://arxiv.org/abs/2402.16541v3","updated":"2024-07-23T10:27:14Z","published":"2024-02-26T12:59:20Z","title":"Integer Programming Using A Single Atom","summary":" Integer programming (IP), as the name suggests, is an integer-variable-based\napproach commonly used to formulate real-world optimization problems with\nconstraints. Currently, quantum algorithms reformulate the IP into an\nunconstrained form through the use of binary variables, which is an indirect\nand resource-consuming way of solving it. We develop an algorithm that maps and\nsolves an IP problem in its original form to any quantum system possessing a\nlarge number of accessible internal degrees of freedom that are controlled with\nsufficient accuracy. This work leverages the principle of superposition to\nsolve the optimization problem. Using a single Rydberg atom as an example, we\nassociate the integer values to electronic states belonging to different\nmanifolds and implement a selective superposition of different states to solve\nthe full IP problem. The optimal solution is found within a few microseconds\nfor prototypical IP problems with up to eight variables and four constraints.\nThis also includes non-linear IP problems, which are usually harder to solve\nwith classical algorithms when compared to their linear counterparts. Our\nalgorithm for solving IP is benchmarked by a well-known classical algorithm\n(branch and bound) in terms of the number of steps needed for convergence to\nthe solution. 
This approach carries the potential to improve the solutions\nobtained for larger-size problems using hybrid quantum-classical algorithms.\n","authors":["Kapil Goswami","Peter Schmelcher","Rick Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2402.16541v3.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.17001v2","updated":"2024-07-23T08:16:19Z","published":"2024-05-27T09:45:06Z","title":"Delta-modular ILP Problems of Bounded Co-dimension, Discrepancy, and\n Convolution","summary":" For $k, n \\geq 0$, and $c \\in Z^n$, we consider ILP problems \\begin{gather*}\n \\max\\bigl\\{ c^\\top x \\colon A x = b,\\, x \\in Z^n_{\\geq 0} \\bigr\\}\\text{ with\n$A \\in Z^{k \\times n}$, $rank(A) = k$, $b \\in Z^{k}$ and}\n \\max\\bigl\\{ c^\\top x \\colon A x \\leq b,\\, x \\in Z^n \\bigr\\} \\text{ with $A\n\\in Z^{(n+k) \\times n}$, $rank(A) = n$, $b \\in Z^{n+k}$.} \\end{gather*} The\nfirst problem is called an \\emph{ILP problem in the standard form of the\ncodimension $k$}, and the second problem is called an \\emph{ILP problem in the\ncanonical form with $n+k$ constraints.} We show that, for any sufficiently\nlarge $\\Delta$, both problems can be solved with $$ 2^{O(k)} \\cdot (f_{k,d}\n\\cdot \\Delta)^2 / 2^{\\Omega\\bigl(\\sqrt{\\log(f_{k,d} \\cdot \\Delta)}\\bigr)} $$\noperations, where $\n f_{k,d} = \\min \\Bigl\\{ k^{k/2},\n \\bigl(\\log k \\cdot \\log (d + k)\\bigr)^{k/2}\n \\Bigr\\} $, $d$ is the dimension of a corresponding polyhedron and $\\Delta$ is\nthe maximum absolute value of $rank(A) \\times rank(A)$ sub-determinants of $A$.\n As our second main result, we show that the feasibility variants of both\nproblems can be solved with $$ 2^{O(k)} \\cdot f_{k,d} \\cdot \\Delta \\cdot\n\\log^3(f_{k,d} \\cdot \\Delta) $$ operations. The constant $f_{k,d}$ can be\nreplaced by another constant $g_{k,\\Delta} = \\bigl(\\log k \\cdot \\log (k\n\\Delta)\\bigr)^{k/2}$ that depends only on $k$ and $\\Delta$. Additionally, we\nconsider different special cases with $k=0$ and $k=1$, which have interesting\napplications.\n As a result of independent interest, we propose an\n$n^2/2^{\\Omega\\bigl(\\sqrt{\\log n}\\bigr)}$-time algorithm for the tropical\nconvolution problem on sequences, indexed by elements of a finite Abelian group\nof the order $n$. Additionally, we give a complete, self-contained error\nanalysis of the generalized Discrete Fourier Transform for Abelian groups with\nrespect to the Word-RAM computational model.\n","authors":["D. Gribanov","D. Malyshev","P. M. Pardalos"],"pdf_url":"https://arxiv.org/pdf/2405.17001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15884v3","updated":"2024-07-23T06:10:19Z","published":"2023-11-27T14:53:45Z","title":"Elementary Quantum Recursion Schemes That Capture Quantum\n Polylogarithmic Time Computability of Quantum Functions","summary":" Quantum computing has been studied over the past four decades based on two\ncomputational models of quantum circuits and quantum Turing machines. To\ncapture quantum polynomial-time computability, a new recursion-theoretic\napproach was taken lately by Yamakami [J. Symb. Logic 80, pp.~1546--1587, 2020]\nby way of recursion schematic definition, which constitutes six initial quantum\nfunctions and three construction schemes of composition, branching, and\nmulti-qubit quantum recursion. By taking a similar approach, we look into\nquantum polylogarithmic-time computability and further explore the expressing\npower of elementary schemes designed for such quantum computation. 
In\nparticular, we introduce an elementary form of the quantum recursion, called\nthe fast quantum recursion, and formulate $EQS$ (elementary quantum schemes) of\n``elementary'' quantum functions. This class $EQS$ captures exactly quantum\npolylogarithmic-time computability, which forms the complexity class\nBQPOLYLOGTIME. We also demonstrate the separation of BQPOLYLOGTIME from\nNLOGTIME and PPOLYLOGTIME. As a natural extension of $EQS$, we further consider\nan algorithmic procedural scheme that implements the well-known\ndivide-and-conquer strategy. This divide-and-conquer scheme helps compute the\nparity function but the scheme cannot be realized within our system $EQS$.\n","authors":["Tomoyuki Yamakami"],"pdf_url":"https://arxiv.org/pdf/2311.15884v3.pdf","comment":"(A4, 10pt, 29 pages) This is a corrected and expanded version of the\n preliminary report that has appeared, under a different title, in the\n Proceedings of the 28th International Conference on Logic, Language,\n Information, and Computation (WoLLIC 2022), Ia\\c{s}i, Romania, September\n 20--23, 2022, Lecture Notes in Computer Science, vol. 13468, pp. 88-104,\n Springer, 2022"},{"id":"http://arxiv.org/abs/2403.06580v2","updated":"2024-07-23T05:16:25Z","published":"2024-03-11T10:27:30Z","title":"Arborescences and Shortest Path Trees when Colors Matter","summary":" Color-constrained subgraph problems are those where we are given an\nedge-colored (directed or undirected) graph and the task is to find a specific\ntype of subgraph, like a spanning tree, an arborescence, a single-source\nshortest path tree, a perfect matching etc., with constraints on the number of\nedges of each color. Some of these problems, like color-constrained spanning\ntree, have elegant solutions and some of them, like color-constrained perfect\nmatching, are longstanding open questions. In this work, we study\ncolor-constrained arborescences and shortest path trees. Computing a\ncolor-constrained shortest path tree on weighted digraphs turns out to be\nNP-hard in general but polynomial-time solvable when all cycles have positive\nweight. This polynomial-time solvability is due to the fact that the solution\nspace is essentially the set of all color-constrained arborescences of a\ndirected acyclic subgraph of the original graph. While finding\ncolor-constrained arborescence of digraphs is NP-hard in general, we give\nefficient algorithms when the input graph is acyclic. Consequently, a\ncolor-constrained shortest path tree on weighted digraphs having only positive\nweight cycles can be efficiently computed. Our algorithms also generalize to\nthe problem of finding a color-constrained shortest path tree with minimum\ntotal weight. En route, we sight nice connections to colored matroids and\ncolor-constrained bases.\n","authors":["P. S. Ardra","Jasine Babu","Kritika Kashyap","R. Krithika","Sreejith K. Pallathumadam","Deepak Rajendraprasad"],"pdf_url":"https://arxiv.org/pdf/2403.06580v2.pdf","comment":"Major revision, solving a more generalized problem"},{"id":"http://arxiv.org/abs/2403.03530v2","updated":"2024-07-23T03:04:59Z","published":"2024-03-06T08:08:02Z","title":"Average-case deterministic query complexity of boolean functions with\n fixed weight","summary":" We explore the $\\textit{average-case deterministic query complexity}$ of\nboolean functions under a $\\textit{uniform distribution}$, denoted by\n$\\mathrm{D}_\\mathrm{ave}(f)$, the minimum average depth of zero-error decision\ntree computing a boolean function $f$. 
This measure has found several\napplications across diverse fields, yet its understanding is limited. We study\n$\\mathrm{D}_\\mathrm{ave}(f)$ of several functions, including the penalty\nshoot-out function, symmetric functions, linear threshold functions and the\ntribes functions. We prove $\\mathrm{D}_\\mathrm{ave}(f) \\le \\max \\{ \\log\n\\frac{\\mathrm{wt}(f)}{\\log n} + O(\\log \\log \\frac{\\mathrm{wt}(f)}{\\log n}),\nO(1) \\}$ for every $n$-variable boolean function $f$, where $\\mathrm{wt}(f)$\ndenotes the weight (the number of inputs on which $f$ outputs $1$). For any\n$4\\log n \\le m(n) \\le 2^{n-1}$, we prove the upper bound is tight up to an\nadditive logarithmic term for almost all $n$-variable boolean functions with\nweight $\\mathrm{wt}(f) = m(n)$. Using H\\r{a}stad's switching lemma or Rossman's\nswitching lemma [Comput. Complexity Conf. 137, 2019], one can derive\n$\\mathrm{D}_\\mathrm{ave}(f) \\leq n(1 - \\frac{1}{O(w)})$ or\n$\\mathrm{D}_\\mathrm{ave}(f) \\le n(1 - \\frac{1}{O(\\log s)})$ for CNF/DNF\nformulas of width $w$ or size $s$, respectively. We show that, for any $w \\ge\n\\log n + \\log \\log n + 3$, there exists a DNF formula of width $w$ and size\n$\\lceil 2^w / w \\rceil$ such that $\\mathrm{D}_\\mathrm{ave}(f) = n (1 -\n\\frac{\\log n}{\\Theta(w)})$. In other words, we show the criticality upper\nbounds $O(w)$ and $O(\\log s)$ are tight up to a multiplicative $\\log n$ factor,\nproviding evidence on the tightness of the switching lemmas.\n","authors":["Yuan Li","Haowei Wu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2403.03530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05764v3","updated":"2024-07-23T02:46:49Z","published":"2023-09-11T18:46:13Z","title":"Equality cases of the Alexandrov--Fenchel inequality are not in the\n polynomial hierarchy","summary":" Describing the equality conditions of the Alexandrov--Fenchel inequality has\nbeen a major open problem for decades. We prove that in the case of convex\npolytopes, this description is not in the polynomial hierarchy unless the\npolynomial hierarchy collapses to a finite level. This is the first hardness\nresult for the problem, and is a complexity counterpart of the recent result by\nShenfeld and van Handel (arXiv:2011.04059), which gave a geometric\ncharacterization of the equality conditions. The proof involves Stanley's order\npolytopes and employs poset theoretic technology.\n","authors":["Swee Hong Chan","Igor Pak"],"pdf_url":"https://arxiv.org/pdf/2309.05764v3.pdf","comment":"35 pages. Fixed some typos and updated some references. to appear in\n Forum Math. Pi"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2311.17620v2","updated":"2024-07-23T12:56:19Z","published":"2023-11-29T13:28:30Z","title":"Linear Matching of JavaScript Regular Expressions","summary":" Modern regex languages have strayed far from well-understood traditional\nregular expressions: they include features that fundamentally transform the\nmatching problem. In exchange for these features, modern regex engines at times\nsuffer from exponential complexity blowups, a frequent source of\ndenial-of-service vulnerabilities in JavaScript applications. 
Worse, regex\nsemantics differ across languages, and the impact of these divergences on\nalgorithmic design and worst-case matching complexity has seldom been\ninvestigated.\n This paper provides a novel perspective on JavaScript's regex semantics by\nidentifying a larger-than-previously-understood subset of the language that can\nbe matched with linear time guarantees. In the process, we discover several\ncases where state-of-the-art algorithms were either wrong (semantically\nincorrect), inefficient (suffering from superlinear complexity) or excessively\nrestrictive (assuming certain features could not be matched linearly). We\nintroduce novel algorithms to restore correctness and linear complexity. We\nfurther advance the state-of-the-art in linear regex matching by presenting the\nfirst nonbacktracking algorithms for matching lookarounds in linear time: one\nsupporting captureless lookbehinds in any regex language, and another\nleveraging a JavaScript property to support unrestricted lookaheads and\nlookbehinds. Finally, we describe new time and space complexity tradeoffs for\nregex engines. All of our algorithms are practical: we validated them in a\nprototype implementation, and some have also been merged in the V8 JavaScript\nimplementation used in Chrome and Node.js.\n","authors":["Aurèle Barrière","Clément Pit-Claudel"],"pdf_url":"https://arxiv.org/pdf/2311.17620v2.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2407.16683v1","updated":"2024-07-23T17:48:17Z","published":"2024-07-23T17:48:17Z","title":"Gödel logics: Prenex fragments","summary":" In this paper, we provide a complete classification for the first-order\nG\\\"odel logics concerning the property that the formulas admit logically\nequivalent prenex normal forms. We show that the only first-order G\\\"odel\nlogics that admit such prenex forms are those with finite truth value sets\nsince they allow all quantifier-shift rules and the logic \\(G_\\uparrow\\) with\nonly one accumulation point at $1$ in the infinite truth values set. In all the\nother cases, there are generally no logically equivalent prenex normal forms.\nWe will also see that \\(G_\\uparrow\\) is the intersection of all finite\nfirst-order G\\\"odel logics.\\\\ The second part of this paper investigates the\nexistence of effective equivalence between the validity of a formula and the\nvalidity of some prenex normal form. The existence of such a normal form is\nobvious for finite valued G\\\"odel logic and \\(G_\\uparrow\\). G\\\"odel logics with\nan uncountable truth value set admit the prenex normal forms if and only if\nevery surrounding of \\(0\\) is uncountable or \\(0\\) is an isolated point.\nOtherwise, uncountable G\\\"odel logics are not recursively enumerable, however,\nthe prenex fragment is always recursively enumerable. Therefore, there is no\neffective translation between the valid formula and the valid prenex normal\nform. 
However, the existence of effectively constructible validity equivalent\nprenex forms for the countable case is still up for debate.\n","authors":["Matthias Baaz","Mariami Gamsakhurdia"],"pdf_url":"https://arxiv.org/pdf/2407.16683v1.pdf","comment":"Research supported by FWF grant P 36571"},{"id":"http://arxiv.org/abs/2407.16629v1","updated":"2024-07-23T16:40:39Z","published":"2024-07-23T16:40:39Z","title":"Efficient Discovery of Actual Causality using Abstraction-Refinement","summary":" Causality is an influence by which one event contributes to the production of\nanother event, where the cause is partly responsible for the effect, and the\neffect is partly dependent on the cause. In this paper, we propose a novel and\neffective method to formally reason about the causal effect of events in\nengineered systems, with application to finding the root cause of safety\nviolations in embedded and cyber-physical systems. We are motivated by the\nnotion of actual causality by Halpern and Pearl, which focuses on the causal\neffect of particular events, rather than type-level causality, which attempts\nto make general statements about scientific and natural phenomena. Our first\ncontribution is formulating discovery of actual causality in computing systems\nmodeled by transition systems as an SMT solving problem. Since datasets for\ncausality analysis tend to be large, in order to tackle the scalability problem\nof automated formal reasoning, our second contribution is a novel technique\nbased on abstraction-refinement that allows identifying actual causes within\nsmaller abstract causal models. We demonstrate the effectiveness of our\napproach (by several orders of magnitude) using three case studies to find the\nactual cause of violations of safety in (1) a neural network controller for a\nmountain car, (2) a controller for a lunar lander obtained by reinforcement\nlearning, and (3) an MPC controller for an F-16 autopilot simulator.\n","authors":["Arshia Rafieioskouei","Borzoo Bonakdarpour"],"pdf_url":"https://arxiv.org/pdf/2407.16629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.04298v4","updated":"2024-07-23T14:01:28Z","published":"2021-07-09T08:18:53Z","title":"An Algorithm for Reversible Logic Circuit Synthesis Based on Tensor\n Decomposition","summary":" An algorithm for reversible logic synthesis is proposed. The task is, for a\ngiven $n$-bit substitution map $P_n: \\{0,1\\}^n \\rightarrow \\{0,1\\}^n$, to find\na sequence of reversible logic gates that implements the map. The gate library\nadopted in this work consists of multiple-controlled Toffoli gates denoted by\n$C^m\\!X$, where $m$ is the number of control bits that ranges from 0 to $n-1$.\nControlled gates with large $m \\,\\,(>2)$ are then further decomposed into\n$C^0\\!X$, $C^1\\!X$, and $C^2\\!X$ gates. A primary concern in designing the\nalgorithm is to reduce the use of $C^2\\!X$ gate (also known as Toffoli gate)\nwhich is known to be universal.\n The main idea is to view an $n$-bit substitution map as a rank-$2n$ tensor\nand to transform it such that the resulting map can be written as a tensor\nproduct of a rank-($2n-2$) tensor and the $2\\times 2$ identity matrix. Let\n$\\mathcal{P}_n$ be a set of all $n$-bit substitution maps. What we try to find\nis a size reduction map $\\mathcal{A}_{\\rm red}: \\mathcal{P}_n \\rightarrow\n\\{P_n: P_n = P_{n-1} \\otimes I_2\\}$, where $I_m$ is the $m\\times m$ identity\nmatrix. 
One can see that the output $P_{n-1} \\otimes I_2$ acts nontrivially on\n$n-1$ bits only, meaning that the map to be synthesized becomes $P_{n-1}$. The\nsize reduction process is iteratively applied until it reaches a tensor product\nof only $2 \\times 2$ matrices.\n","authors":["Hochang Lee","Kyung Chul Jeong","Daewan Han","Panjin Kim"],"pdf_url":"https://arxiv.org/pdf/2107.04298v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19632v2","updated":"2024-07-23T07:40:14Z","published":"2024-04-30T15:34:51Z","title":"Behavioural Metrics: Compositionality of the Kantorovich Lifting and an\n Application to Up-To Techniques","summary":" Behavioural distances of transition systems modelled via coalgebras for\nendofunctors generalize traditional notions of behavioural equivalence to a\nquantitative setting, in which states are equipped with a measure of how\n(dis)similar they are. Endowing transition systems with such distances\nessentially relies on the ability to lift functors describing the one-step\nbehavior of the transition systems to the category of pseudometric spaces. We\nconsider the category theoretic generalization of the Kantorovich lifting from\ntransportation theory to the case of lifting functors to quantale-valued\nrelations, which subsumes equivalences, preorders and (directed) metrics. We\nuse tools from fibred category theory, which allow one to see the Kantorovich\nlifting as arising from an appropriate fibred adjunction. Our main\ncontributions are compositionality results for the Kantorovich lifting, where\nwe show that the lifting of a composed functor coincides with the\ncomposition of the liftings. In addition, we describe how to lift distributive\nlaws in the case where one of the two functors is polynomial (with finite\ncoproducts). These results are essential ingredients for adapting\nup-to-techniques to the case of quantale-valued behavioural distances. Up-to\ntechniques are a well-known coinductive technique for efficiently showing lower\nbounds for behavioural distances. We illustrate the results of our paper in two\ncase studies.\n","authors":["Keri D'Angelo","Sebastian Gurke","Johanna Maria Kirss","Barbara König","Matina Najafi","Wojciech Różowski","Paul Wild"],"pdf_url":"https://arxiv.org/pdf/2404.19632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14347v3","updated":"2024-07-23T02:50:29Z","published":"2023-11-24T08:48:00Z","title":"Typed compositional quantum computation with lenses","summary":" We propose a type-theoretic framework for describing and proving properties\nof quantum computations, in particular those presented as quantum circuits. Our\nproposal is based on an observation that, in the polymorphic type system of\nCoq, currying on quantum states allows us to apply quantum gates directly\ninside a complex circuit. By introducing a discrete notion of lens to control\nthis currying, we are further able to separate the combinatorics of the circuit\nstructure from the computational content of gates. 
We apply our development to\ndefine quantum circuits recursively from the bottom up, and prove their\ncorrectness compositionally.\n","authors":["Jacques Garrigue","Takafumi Saikawa"],"pdf_url":"https://arxiv.org/pdf/2311.14347v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16117v1","updated":"2024-07-23T01:39:47Z","published":"2024-07-23T01:39:47Z","title":"A Logic for Veracity: Development and Implementation","summary":" In the business rules of supply chains, there are concerns around trust,\ntruth, demonstrability and authenticity. These concerns are gathered together\nunder the name ``veracity\".\n In the work for this paper we were originally motivated by the requirement\naround organic certification in the wine industry in New Zealand, but veracity\narises in many different situations and our formalisation shows how formal\nmethods can give insights into many such practical problems.\n One activity for formal methods involves taking informal processes and\nformalising them and subsequently building tools to support this formalisation\nand therefore the original processes too, and the work reported here is an\nexample of that.\n Here, then, we explore the idea of veracity in this spirit, give highlights\nof the development of a logic for it and show how that logic can be implemented\nin Coq, both for proof support and automation.\n","authors":["Daniel Britten","Steve Reeves"],"pdf_url":"https://arxiv.org/pdf/2407.16117v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.06164"},{"id":"http://arxiv.org/abs/2404.19724v2","updated":"2024-07-23T23:40:06Z","published":"2024-04-30T17:19:30Z","title":"Sound and Complete Proof Rules for Probabilistic Termination","summary":" Deciding termination is a fundamental problem in the analysis of\nprobabilistic imperative programs. We consider the qualitative and quantitative\nprobabilistic termination problems for an imperative programming model with\ndiscrete probabilistic choice and demonic bounded nondeterminism. The\nqualitative question asks if the program terminates almost-surely, no matter\nhow nondeterminism is resolved. The quantitative question asks for a bound on\nthe probability of termination. Despite a long and rich literature on the\ntopic, no sound and relatively complete proof systems were known for these\nproblems. In this paper, we provide such sound and relatively complete proof\nrules for proving qualitative and quantitative termination in the assertion\nlanguage of arithmetic. Our rules use supermartingales as estimates of the\nlikelihood of a program's evolution and variants as measures of distances to\ntermination. Our key insight is our completeness result, which shows how to\nconstruct a suitable supermartingale from an almost-surely terminating\nprogram. We also show that proofs of termination in many existing proof systems\ncan be transformed to proofs in our system, pointing to its applicability in\npractice. As an application of our proof rule, we show an explicit proof of\nalmost-sure termination for the two-dimensional random walker.\n","authors":["Rupak Majumdar","V. R. Sathiyanarayana"],"pdf_url":"https://arxiv.org/pdf/2404.19724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16827v1","updated":"2024-07-23T20:35:33Z","published":"2024-07-23T20:35:33Z","title":"Path-optimal symbolic execution of heap-manipulating programs","summary":" Symbolic execution is at the core of many techniques for program analysis and\ntest generation. 
Traditional symbolic execution of programs with numeric inputs\nenjoys the property of forking as many analysis traces as the number of\nanalyzed program paths, a property that in this paper we refer to as path\noptimality. On the contrary, current approaches for symbolic execution of\nheap-manipulating programs fail to satisfy this property, thereby incurring\nheavy path explosion effects that crucially penalize the efficiency of the\nanalysis. This paper introduces POSE, path-optimal symbolic execution, a\nsymbolic execution algorithm that, for the first time, accomplishes path\noptimality for heap-manipulating programs. We formalize the POSE algorithm for\na tiny, but representative object-oriented programming language, and implement\nthe formalization in a prototype symbolic executor to evaluate the algorithm\nagainst a benchmark of sample programs that take data structures as inputs. Our\nexperiments provide initial empirical evidence of the potential of POSE for\nimproving on the state of the art of symbolic execution of heap-manipulating\nprograms.\n","authors":["Pietro Braione","Giovanni Denaro"],"pdf_url":"https://arxiv.org/pdf/2407.16827v1.pdf","comment":"16 pages, 12 figures"}]},"2024-07-24T00:00:00Z":{"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2407.16683v2","updated":"2024-07-24T06:56:33Z","published":"2024-07-23T17:48:17Z","title":"Goedel logics: Prenex fragments","summary":" In this paper, we provide a complete classification for the first-order\nGoedel logics concerning the property that the formulas admit logically\nequivalent prenex normal forms. We show that the only first-order Goedel logics\nthat admit such prenex forms are those with finite truth value sets since they\nallow all quantifier-shift rules and the logic $G_\\uparrow$ with only one\naccumulation point at 1 in the infinite truth value set. In all the other\ncases, there are generally no logically equivalent prenex normal forms. We will\nalso see that $G_\\uparrow$ is the intersection of all finite first-order Goedel\nlogics.\n The second part of this paper investigates the existence of effective\nequivalence between the validity of a formula and the validity of some prenex\nnormal form. The existence of such a normal form is obvious for finite valued\nGoedel logic and $G_\\uparrow$. Goedel logics with an uncountable truth value\nset admit the prenex normal forms if and only if every surrounding of 0 is\nuncountable or 0 is an isolated point. Otherwise, uncountable Goedel logics are\nnot recursively enumerable, however, the prenex fragment is always recursively\nenumerable. Therefore, there is no effective translation between the valid\nformula and the valid prenex normal form. However, the existence of effectively\nconstructible validity equivalent prenex forms for the countable case is still\nup for debate.\n","authors":["Matthias Baaz","Mariami Gamsakhurdia"],"pdf_url":"https://arxiv.org/pdf/2407.16683v2.pdf","comment":"Research supported by FWF grant P 36571"},{"id":"http://arxiv.org/abs/2407.17289v1","updated":"2024-07-24T14:00:02Z","published":"2024-07-24T14:00:02Z","title":"Static and Dynamic Verification of OCaml Programs: The Gospel Ecosystem\n (Extended Version)","summary":" We present our work on the collaborative use of dynamic and static analysis\ntools for the verification of software written in the OCaml language. We build\nupon Gospel, a specification language for OCaml that can be used both in\ndynamic and static analyses. 
We employ Ortac for runtime assertion checking,\nand Cameleer and CFML for the deductive verification of OCaml code. We report\non the use of such tools to build a case study of collaborative analysis of a\nnon-trivial OCaml program. This shows how these tools nicely complement each\nother, while at the same time highlighting the differences when writing\nspecifications targeting dynamic or static analysis methods.\n","authors":["Tiago Lopes Soares","Ion Chririca","Mário Pereira"],"pdf_url":"https://arxiv.org/pdf/2407.17289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17215v1","updated":"2024-07-24T12:15:31Z","published":"2024-07-24T12:15:31Z","title":"Formalizing UML State Machines for Automated Verification -- A Survey","summary":" The Unified Modeling Language (UML) is a standard for modeling dynamic\nsystems. UML behavioral state machines are used for modeling the dynamic\nbehavior of object-oriented designs. The UML specification, maintained by the\nObject Management Group (OMG), is documented in natural language (in contrast\nto formal language). The inherent ambiguity of natural languages may introduce\ninconsistencies in the resulting state machine model. Formalizing UML state\nmachine specification aims at solving the ambiguity problem and at providing a\nuniform view to software designers and developers. Such a formalization also\naims at providing a foundation for automatic verification of UML state machine\nmodels, which can help to find software design vulnerabilities at an early\nstage and reduce the development cost. We provide here a comprehensive survey\nof existing work from 1997 to 2021 related to formalizing UML state machine\nsemantics for the purpose of conducting model checking at the design stage.\n","authors":["Étienne André","Shuang Liu","Yang Liu","Christine Choppy","Jun Sun","Jin Song Dong"],"pdf_url":"https://arxiv.org/pdf/2407.17215v1.pdf","comment":"This is the author version of the manuscript of the same name\n published in ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2407.17127v1","updated":"2024-07-24T09:58:21Z","published":"2024-07-24T09:58:21Z","title":"A quantitative probabilistic relational Hoare logic","summary":" We introduce eRHL, a program logic for reasoning about relational expectation\nproperties of pairs of probabilistic programs. eRHL is quantitative, i.e., its\npre- and post-conditions take values in the extended non-negative reals. Thanks\nto its quantitative assertions, eRHL overcomes randomness alignment\nrestrictions from prior logics, including PRHL, a popular relational program\nlogic used to reason about security of cryptographic constructions, and apRHL,\na variant of PRHL for differential privacy. As a result, eRHL is the first\nrelational probabilistic program logic to be supported by non-trivial soundness\nand completeness results for all almost surely terminating programs. We show\nthat eRHL is sound and complete with respect to program equivalence,\nstatistical distance, and differential privacy. We also show that every PRHL\njudgment is valid iff it is provable in eRHL. 
We showcase the practical\nbenefits of eRHL with examples that are beyond reach of PRHL and apRHL.\n","authors":["Martin Avanzini","Gilles Barthe","Davide Davoli","Benjamin Grégoire"],"pdf_url":"https://arxiv.org/pdf/2407.17127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12935v2","updated":"2024-07-24T08:06:41Z","published":"2023-10-19T17:33:21Z","title":"Representing Sugihara monoids via weakening relations","summary":" We show that all Sugihara monoids can be represented as algebras of binary\nrelations, with the monoid operation given by relational composition. Moreover,\nthe binary relations are weakening relations. The first step is to obtain an\nexplicit relational representation of all finite odd Sugihara chains. Our\nconstruction mimics that of Maddux (2010), where a relational representation of\nthe finite even Sugihara chains is given. We define the class of representable\nSugihara monoids as those which can be represented as reducts of distributive\ninvolutive FL-algebras of binary relations. We then show that the class of\nrepresentable distributive involutive FL-algebras is closed under\nultraproducts. This fact is used to demonstrate that the two infinite Sugihara\nmonoids that generate the quasivariety are also representable. From this it\nfollows that all Sugihara monoids are representable.\n","authors":["Andrew Craig","Claudette Robinson"],"pdf_url":"https://arxiv.org/pdf/2310.12935v2.pdf","comment":"29 pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.01261v2","updated":"2024-07-24T03:10:07Z","published":"2023-09-03T19:54:14Z","title":"Worst-Case Input Generation for Concurrent Programs under Non-Monotone\n Resource Metrics","summary":" Worst-case input generation aims to automatically generate inputs that\nexhibit the worst-case performance of programs. It has several applications,\nand can, for example, detect vulnerabilities to denial-of-service attacks.\nHowever, it is non-trivial to generate worst-case inputs for concurrent\nprograms, particularly for resources like memory where the peak cost depends on\nhow processes are scheduled.\n This article presents the first sound worst-case input generation algorithm\nfor concurrent programs under non-monotone resource metrics like memory. The\nkey insight is to leverage resource-annotated session types and symbolic\nexecution. Session types describe communication protocols on channels in\nprocess calculi. Equipped with resource annotations, resource-annotated session\ntypes not only encode cost bounds but also indicate how many resources can be\nreused and transferred between processes. This information is critical for\nidentifying a worst-case execution path during symbolic execution. The\nalgorithm is sound: if it returns any input, it is guaranteed to be a valid\nworst-case input. The algorithm is also relatively complete: as long as\nresource-annotated session types are sufficiently expressive and the background\ntheory for SMT solving is decidable, a worst-case input is guaranteed to be\nreturned. A simple case study of a web server's memory usage demonstrates the\nutility of the worst-case input generation algorithm.\n","authors":["Long Pham","Jan Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2309.01261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.06757v7","updated":"2024-07-24T18:21:58Z","published":"2021-01-17T19:24:46Z","title":"Higher Order Automatic Differentiation of Higher Order Functions","summary":" We present semantic correctness proofs of automatic differentiation (AD). 
We\nconsider a forward-mode AD method on a higher order language with algebraic\ndata types, and we characterise it as the unique structure preserving macro\ngiven a choice of derivatives for basic operations. We describe a rich\nsemantics for differentiable programming, based on diffeological spaces. We\nshow that it interprets our language, and we phrase what it means for the AD\nmethod to be correct with respect to this semantics. We show that our\ncharacterisation of AD gives rise to an elegant semantic proof of its\ncorrectness based on a gluing construction on diffeological spaces. We explain\nhow this is, in essence, a logical relations argument. Throughout, we show how\nthe analysis extends to AD methods for computing higher order derivatives using\na Taylor approximation.\n","authors":["Mathieu Huot","Sam Staton","Matthijs Vákár"],"pdf_url":"https://arxiv.org/pdf/2101.06757v7.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2001.02209"},{"id":"http://arxiv.org/abs/2407.17537v1","updated":"2024-07-24T08:35:50Z","published":"2024-07-24T08:35:50Z","title":"A process algebraic framework for multi-agent dynamic epistemic systems","summary":" This paper combines the classical model of labeled transition systems with\nthe epistemic model for reasoning about knowledge. The result is a unifying\nframework for modeling and analyzing multi-agent, knowledge-based, dynamic\nsystems. On the modeling side, we propose a process algebraic, agent-oriented\nspecification language that makes such a framework easy to use for practical\npurposes. On the verification side, we define a modal logic encompassing\ntemporal and epistemic operators.\n","authors":["Alessandro Aldini"],"pdf_url":"https://arxiv.org/pdf/2407.17537v1.pdf","comment":null}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2407.13018v2","updated":"2024-07-24T17:04:35Z","published":"2024-07-17T21:14:05Z","title":"Proof-of-Collaborative-Learning: A Multi-winner Federated Learning\n Consensus Algorithm","summary":" Regardless of their variations, blockchains require a consensus mechanism to\nvalidate transactions, supervise added blocks, maintain network security,\nsynchronize the network state, and distribute incentives. Proof-of-Work (PoW),\none of the most influential implementations of consensus mechanisms, consumes\nan extraordinary amount of energy for a task that lacks direct productive\noutput. In this paper, we propose Proof-of-Collaborative-Learning (PoCL), a\nmulti-winner federated learning validated consensus mechanism that redirects\nthe computation power of blockchains to train federated learning models. In\naddition, we present a novel evaluation mechanism to ensure the efficiency of\nthe locally trained models of miners. We evaluated the security of our\nevaluation mechanism by introducing and conducting probable attacks. Moreover,\nwe present a novel reward distribution mechanism to incentivize winning miners\nfairly, and demonstrate that our reward system is fair both within and across\nall rounds.\n","authors":["Amirreza Sokhankhosh","Sara Rouhani"],"pdf_url":"https://arxiv.org/pdf/2407.13018v2.pdf","comment":"8 pages. 
Accepted at the 7th IEEE International Conference on\n Blockchain (Blockchain 2024)"},{"id":"http://arxiv.org/abs/2407.17391v1","updated":"2024-07-24T16:14:38Z","published":"2024-07-24T16:14:38Z","title":"Tutorial: Object as a Service (OaaS) Serverless Cloud Computing Paradigm","summary":" While the first generation of cloud computing systems mitigated the job of\nsystem administrators, the next generation of cloud computing systems is\nemerging to mitigate the burden for cloud developers -- facilitating the\ndevelopment of cloud-native applications. This paradigm shift is primarily\nhappening by offering higher-level serverless abstractions, such as Function as\na Service (FaaS). Although FaaS has successfully abstracted developers from the\ncloud resource management details, it falls short in abstracting the management\nof both data (i.e., state) and the non-functional aspects, such as Quality of\nService (QoS) requirements. The lack of such abstractions implies developer\nintervention and is counterproductive to the objective of mitigating the burden\nof cloud-native application development. To further streamline cloud-native\napplication development, we present Object-as-a-Service (OaaS) -- a serverless\nparadigm that borrows the object-oriented programming concepts to encapsulate\napplication logic and data in addition to non-functional requirements into a\nsingle deployment package, thereby streamlining provider-agnostic cloud-native\napplication development. We realized the OaaS paradigm through the development\nof an open-source platform called Oparaca. In this tutorial, we will present\nthe concept and design of the OaaS paradigm and its implementation -- the\nOparaca platform. Then, we give a tutorial on developing and deploying the\napplication on the Oparaca platform and discuss its benefits and its optimal\nconfigurations to avoid potential overheads.\n","authors":["Pawissanutt Lertpongrujikorn","Mohsen Amini Salehi"],"pdf_url":"https://arxiv.org/pdf/2407.17391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06348v2","updated":"2024-07-24T16:13:45Z","published":"2024-06-10T15:08:14Z","title":"Causal Discovery over High-Dimensional Structured Hypothesis Spaces with\n Causal Graph Partitioning","summary":" The aim in many sciences is to understand the mechanisms that underlie the\nobserved distribution of variables, starting from a set of initial hypotheses.\nCausal discovery allows us to infer mechanisms as sets of cause and effect\nrelationships in a generalized way -- without necessarily tailoring to a\nspecific domain. Causal discovery algorithms search over a structured\nhypothesis space, defined by the set of directed acyclic graphs, to find the\ngraph that best explains the data. For high-dimensional problems, however, this\nsearch becomes intractable and scalable algorithms for causal discovery are\nneeded to bridge the gap. In this paper, we define a novel causal graph\npartition that allows for divide-and-conquer causal discovery with theoretical\nguarantees. We leverage the idea of a superstructure -- a set of learned or\nexisting candidate hypotheses -- to partition the search space. We prove under\ncertain assumptions that learning with a causal graph partition always yields\nthe Markov Equivalence Class of the true causal graph. 
We show our algorithm\nachieves comparable accuracy and a faster time to solution for\nbiologically-tuned synthetic networks and networks up to ${10^4}$ variables.\nThis makes our method applicable to gene regulatory network inference and other\ndomains with high-dimensional structured hypothesis spaces.\n","authors":["Ashka Shah","Adela DePavia","Nathaniel Hudson","Ian Foster","Rick Stevens"],"pdf_url":"https://arxiv.org/pdf/2406.06348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17325v1","updated":"2024-07-24T14:50:01Z","published":"2024-07-24T14:50:01Z","title":"Noise-Aware Distributed Quantum Approximate Optimization Algorithm on\n Near-term Quantum Hardware","summary":" This paper introduces a noise-aware distributed Quantum Approximate\nOptimization Algorithm (QAOA) tailored for execution on near-term quantum\nhardware. Leveraging a distributed framework, we address the limitations of\ncurrent Noisy Intermediate-Scale Quantum (NISQ) devices, which are hindered by\nlimited qubit counts and high error rates. Our approach decomposes large QAOA\nproblems into smaller subproblems, distributing them across multiple Quantum\nProcessing Units (QPUs) to enhance scalability and performance. The noise-aware\nstrategy incorporates error mitigation techniques to optimize qubit fidelity\nand gate operations, ensuring reliable quantum computations. We evaluate the\nefficacy of our framework using the HamilToniQ Benchmarking Toolkit, which\nquantifies the performance across various quantum hardware configurations. The\nresults demonstrate that our distributed QAOA framework achieves significant\nimprovements in computational speed and accuracy, showcasing its potential to\nsolve complex optimization problems efficiently in the NISQ era. This work sets\nthe stage for advanced algorithmic strategies and practical quantum system\nenhancements, contributing to the broader goal of achieving quantum advantage.\n","authors":["Kuan-Cheng Chen","Xiatian Xu","Felix Burt","Chen-Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17316v1","updated":"2024-07-24T14:39:24Z","published":"2024-07-24T14:39:24Z","title":"Lossy Data Compression By Adaptive Mesh Coarsening","summary":" Today's scientific simulations, for example in the high-performance exascale\nsector, produce huge amounts of data. Due to limited I/O bandwidth and\navailable storage space, it is necessary to reduce the scientific data produced\nby high performance computing applications. Error-bounded lossy compression has\nbeen proven to be an effective approach to tackling the trade-off between\naccuracy and storage space. Within this work, we explore and discuss\nerror-bounded lossy compression based solely on adaptive mesh refinement\ntechniques. This compression technique is not only easily integrated into\nexisting adaptive mesh refinement applications but also serves as a general\nlossy compression approach for arbitrary data in the form of multi-dimensional\narrays, irrespective of the data type. Moreover, these techniques permit the\nexclusion of regions of interest and even allow for nested error domains\nduring the compression. The described data compression technique is\ndemonstrated on ERA5 data as an example.\n","authors":["N. Böing","J. Holke","C. Hergl","L. Spataro","G. Gassner","A. 
Basermann"],"pdf_url":"https://arxiv.org/pdf/2407.17316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17314v1","updated":"2024-07-24T14:38:03Z","published":"2024-07-24T14:38:03Z","title":"Edge-Cloud Continuum Orchestration of Critical Services: A Smart-City\n Approach","summary":" Smart-city services are typically developed as closed systems within each\ncity's vertical, communicating and interacting with cloud services while\nremaining isolated within each provider's domain. With the emergence of 5G\nprivate domains and the introduction of new M2M services focusing on autonomous\nsystems, there is a shift from the cloud-based approach to a distributed edge\ncomputing paradigm, in a \\textit{continuum} orchestration. However, an\nessential component is missing. Current orchestration tools, designed for\ncloud-based deployments, lack robust workload isolation, fail to meet timing\nconstraints, and are not tailored to the resource-constrained nature of edge\ndevices. Therefore, new orchestration methods are needed to support MEC\nenvironments. The work presented in this paper addresses this gap. Based on the\nreal needs of a smart-city testbed - the Aveiro Living Lab-, we developed a set\nof orchestration components to facilitate the seamless orchestration of both\ncloud and edge-based services, encompassing both critical and non-critical\nservices. This work extends the current Kubernetes orchestration platform to\ninclude a novel location-specific resource definition, a custom scheduler to\naccommodate real-time and legacy services, continuous service monitoring to\ndetect sub-optimal states, and a refined load balancing mechanism that\nprioritizes the fastest response times.\n","authors":["Rodrigo Rosmaninho","Duarte Raposo","Pedro Rito","Susana Sargento"],"pdf_url":"https://arxiv.org/pdf/2407.17314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17287v1","updated":"2024-07-24T13:56:56Z","published":"2024-07-24T13:56:56Z","title":"Software Defined Vehicles for Development of Deterministic Services","summary":" With modern vehicles evolving with more features, services, complex systems,\nwith more sensors, actuators, and processing units, it is essential to think\nabout vehicles not only as means of transportation that may tend towards full\nautonomy, but also as adaptive objects, that suit themselves to the needs of\noccupants. Vehicular services can be developed to support these adaptations.\nHowever, the increasing complexity of vehicular service development, even with\ncurrent standardizations and best practices and guidelines, are insufficient to\ntackle the high complexity of development, with expectations of up to 1 (U.S.)\nbillion lines of code for a fully (level 5) autonomous vehicle. Within this\nsurvey, the paradigm of Deterministic Software Defined Vehicles is explored\ntowards increasing the quality and easiness of the development of services for\nautomotive. 
Towards this, a proposed vision with four pillars is also provided:\nthe deterministic network configurator, the data layer configurator, the\nhypervisor configurator, and the vehicle abstraction layer, all coordinated by a\nsoftware orchestrator.\n","authors":["Pedro Veloso Teixeira","Duarte Raposo","Rui Lopes","Susana Sargento"],"pdf_url":"https://arxiv.org/pdf/2407.17287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17276v1","updated":"2024-07-24T13:42:46Z","published":"2024-07-24T13:42:46Z","title":"Bridging Trust into the Blockchain: A Systematic Review on On-Chain\n Identity","summary":" The ongoing regulation of blockchain-based services and applications requires\nthe identification of users who are issuing transactions on the blockchain.\nThis systematic review explores the current status, identifies research gaps,\nand outlines future research directions for establishing trusted and\nprivacy-compliant identities on the blockchain (on-chain identity). A\nsystematic search term was applied across various scientific databases,\ncollecting 2232 potentially relevant research papers. These papers were\nnarrowed down in two methodologically executed steps to 98 and finally to 13\nrelevant sources. The relevant articles were then systematically analyzed based\non a set of screening questions. The results of the selected studies have\nprovided insightful findings on the mechanisms of on-chain identities. On-chain\nidentities are established using zero-knowledge proofs, public key\ninfrastructure/certificates, and web of trust approaches. The technologies and\narchitectures used by the authors are also highlighted. Trust has emerged as a\nkey research gap, manifesting in two ways: firstly, a gap in how to trust the\ndigital identity representation of a physical human; secondly, a gap in how to\ntrust identity providers that issue identity confirmations on-chain. Potential\nfuture research avenues are suggested to help fill the current gaps in\nestablishing trust and on-chain identities.\n","authors":["Awid Vaziry","Kaustabh Barman","Patrick Herbke"],"pdf_url":"https://arxiv.org/pdf/2407.17276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09901v2","updated":"2024-07-24T11:43:38Z","published":"2022-08-21T14:58:15Z","title":"Scalable mRMR feature selection to handle high dimensional datasets:\n Vertical partitioning based Iterative MapReduce framework","summary":" While building machine learning models, feature selection (FS) stands out as\nan essential preprocessing step used to handle the uncertainty and vagueness in\nthe data. Recently, the minimum Redundancy and Maximum Relevance (mRMR)\napproach has proven to be effective in obtaining the irredundant feature\nsubset. Owing to the generation of voluminous datasets, it is essential to\ndesign scalable solutions using distributed/parallel paradigms. MapReduce\nsolutions are proven to be one of the best approaches to designing\nfault-tolerant and scalable solutions. This work analyses the existing\nMapReduce approaches for mRMR feature selection and identifies the limitations\nthereof. In the current study, we proposed VMR_mRMR, an efficient vertical\npartitioning-based approach using memorization, thereby overcoming the\nlimitations of extant approaches. The experimental analysis shows that VMR_mRMR\nsignificantly outperformed extant approaches and achieved a better\ncomputational gain (C.G). 
In addition, we also conducted a comparative analysis\nwith the horizontal partitioning approach HMR_mRMR [1] to assess the strengths\nand limitations of the proposed approach.\n","authors":["Yelleti Vivek","P. S. V. S. Sai Prasad"],"pdf_url":"https://arxiv.org/pdf/2208.09901v2.pdf","comment":"20 pages, 3 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2407.05678v4","updated":"2024-07-24T09:41:04Z","published":"2024-07-08T07:25:26Z","title":"Computational Power of Mobile Robots in Synchronous Environment:\n Discrete Version","summary":" In distributed computing by mobile robots, robots are deployed over a region,\ncontinuous or discrete, operating through a sequence of\n\\textit{look-compute-move} cycles. An extensive study has been carried out to\nunderstand the computational powers of different robot models. The models vary\non the ability to 1)~remember constant size information and 2)~communicate\nconstant size message. Depending on the abilities the different models are\n1)~$\\mathcal{OBLOT}$ (robots are oblivious and silent), 2)~$\\mathcal{FSTA}$\n(robots have finite states but silent), 3)~$\\mathcal{FCOM}$ (robots are\noblivious but can communicate constant size information) and,\n4)~$\\mathcal{LUMI}$ (robots have finite states and can communicate constant\nsize information). Another factor that affects computational ability is the\nscheduler that decides the activation time of the robots. The main three\nschedulers are \\textit{fully-synchronous}, \\textit{semi-synchronous} and\n\\textit{asynchronous}. Combining the models ($M$) with schedulers ($K$), we\nhave twelve combinations $M^K$.\n In the euclidean domain, the comparisons between these twelve variants have\nbeen done in different works for transparent robots, opaque robots, and robots\nwith limited visibility. There is a vacant space for similar works when robots\nare operating on discrete regions like networks. It demands separate research\nattention because there have been a series of works where robots operate on\ndifferent networks, and there is a fundamental difference when robots are\noperating on a continuous domain versus a discrete domain in terms of robots'\nmovement. This work contributes to filling the space by giving a full\ncomparison table for all models with two synchronous schedulers:\nfully-synchronous and semi-synchronous.\n","authors":["Avisek Sharma","Pritam Goswami","Buddhadeb Sau"],"pdf_url":"https://arxiv.org/pdf/2407.05678v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01261v2","updated":"2024-07-24T03:10:07Z","published":"2023-09-03T19:54:14Z","title":"Worst-Case Input Generation for Concurrent Programs under Non-Monotone\n Resource Metrics","summary":" Worst-case input generation aims to automatically generate inputs that\nexhibit the worst-case performance of programs. It has several applications,\nand can, for example, detect vulnerabilities to denial-of-service attacks.\nHowever, it is non-trivial to generate worst-case inputs for concurrent\nprograms, particularly for resources like memory where the peak cost depends on\nhow processes are scheduled.\n This article presents the first sound worst-case input generation algorithm\nfor concurrent programs under non-monotone resource metrics like memory. The\nkey insight is to leverage resource-annotated session types and symbolic\nexecution. Session types describe communication protocols on channels in\nprocess calculi. 
Equipped with resource annotations, resource-annotated session\ntypes not only encode cost bounds but also indicate how many resources can be\nreused and transferred between processes. This information is critical for\nidentifying a worst-case execution path during symbolic execution. The\nalgorithm is sound: if it returns any input, it is guaranteed to be a valid\nworst-case input. The algorithm is also relatively complete: as long as\nresource-annotated session types are sufficiently expressive and the background\ntheory for SMT solving is decidable, a worst-case input is guaranteed to be\nreturned. A simple case study of a web server's memory usage demonstrates the\nutility of the worst-case input generation algorithm.\n","authors":["Long Pham","Jan Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2309.01261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17651v1","updated":"2024-07-24T21:35:29Z","published":"2024-07-24T21:35:29Z","title":"PARS3: Parallel Sparse Skew-Symmetric Matrix-Vector Multiplication with\n Reverse Cuthill-McKee Reordering","summary":" Sparse matrices, as prevalent primitives of various scientific computing\nalgorithms, persist as a bottleneck in processing. A skew-symmetric matrix\nflips signs of symmetric pairs in a symmetric matrix. Our work, Parallel 3-Way\nBanded Skew-Symmetric Sparse Matrix-Vector Multiplication, equally improves\nparallel symmetric SpMV kernels with a different perspective than the common\nliterature trends, by manipulating the form of the matrix in a preprocessing\nstep to accelerate the repeated computations of iterative solvers. We\neffectively use the Reverse Cuthill-McKee (RCM) reordering algorithm to\ntransform a sparse skew-symmetric matrix into a band matrix, then efficiently\nparallelize it by splitting the band structure into 3 different parts by\nconsidering its local sparsity. Our proposed method with RCM is novel in the\nsense that it is the first implementation of parallel skew-symmetric SpMV\nkernels. Our SpMV enhancements and findings are valuable, with significant\nstrong scaling of up to 19x over the serial compressed SpMV implementation. We\noutperform a heuristic-based graph-coloring approach with synchronization\nphases in implementing parallel symmetric SpMVs. Our approach also naturally\napplies to parallel sparse symmetric SpMVs, which can inspire widespread SpMV\nsolutions to adopt the optimizations presented in this paper.\n","authors":["Selin Yildirim","Murat Manguoglu"],"pdf_url":"https://arxiv.org/pdf/2407.17651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17533v1","updated":"2024-07-24T04:22:37Z","published":"2024-07-24T04:22:37Z","title":"SFPrompt: Communication-Efficient Split Federated Fine-Tuning for Large\n Pre-Trained Models over Resource-Limited Devices","summary":" Large pre-trained models have exhibited remarkable achievements across\nvarious domains. The substantial training costs associated with these models\nhave led to wide studies of fine-tuning for effectively harnessing their\ncapabilities in solving downstream tasks. Yet, conventional fine-tuning\napproaches become infeasible when the model lacks access to downstream data due\nto privacy concerns. Naively integrating fine-tuning approaches with the\nemerging federated learning frameworks incurs substantial communication\noverhead and exerts high demand on local computing resources, making it\nimpractical for common resource-limited devices. 
In this paper, we introduce\nSFPrompt, an innovative privacy-preserving fine-tuning method tailored for the\nfederated setting where direct uploading of raw data is prohibited and local\ndevices are resource-constrained to run a complete pre-trained model. In\nessence, SFPrompt judiciously combines split learning with federated learning\nto handle these challenges. Specifically, the pre-trained model is first\npartitioned into client and server components, thereby streamlining the\nclient-side model and substantially alleviating computational demands on local\nresources. SFPrompt then introduces soft prompts into the federated model to\nenhance the fine-tuning performance. To further reduce communication costs, a\nnovel dataset pruning algorithm and a local-loss update strategy are devised\nduring the fine-tuning process. Extensive experiments demonstrate that SFPrompt\ndelivers competitive performance as the federated full fine-tuning approach\nwhile consuming a mere 0.46% of local computing resources and incurring 53%\nless communication cost.\n","authors":["Linxiao Cao","Yifei Zhu","Wei Gong"],"pdf_url":"https://arxiv.org/pdf/2407.17533v1.pdf","comment":null}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2407.17432v1","updated":"2024-07-24T17:06:21Z","published":"2024-07-24T17:06:21Z","title":"An FPGA-Based Open-Source Hardware-Software Framework for Side-Channel\n Security Research","summary":" Attacks based on side-channel analysis (SCA) pose a severe security threat to\nmodern computing platforms, further exacerbated on IoT devices by their\npervasiveness and handling of private and critical data. Designing\nSCA-resistant computing platforms requires a significant additional effort in\nthe early stages of the IoT devices' life cycle, which is severely constrained\nby strict time-to-market deadlines and tight budgets. This manuscript\nintroduces a hardware-software framework meant for SCA research on FPGA\ntargets. It delivers an IoT-class system-on-chip (SoC) that includes a RISC-V\nCPU, provides observability and controllability through an ad-hoc debug\ninfrastructure to facilitate SCA attacks and evaluate the platform's security,\nand streamlines the deployment of SCA countermeasures through dedicated\nhardware and software features such as a DFS actuator and FreeRTOS support. The\nopen-source release of the framework includes the SoC, the scripts to configure\nthe computing platform, compile a target application, and assess the SCA\nsecurity, as well as a suite of state-of-the-art SCA attacks and\ncountermeasures. 
The goal is to foster its adoption and novel developments in\nthe field, empowering designers and researchers to focus on studying SCA\ncountermeasures and attacks while relying on a sound and stable\nhardware-software platform as the foundation for their research.\n","authors":["Davide Zoni","Andrea Galimberti","Davide Galli"],"pdf_url":"https://arxiv.org/pdf/2407.17432v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.17311v1","updated":"2024-07-24T14:28:24Z","published":"2024-07-24T14:28:24Z","title":"The Magnificent Seven Challenges and Opportunities in Domain-Specific\n Accelerator Design for Autonomous Systems","summary":" The end of Moore's Law and Dennard Scaling has combined with advances in\nagile hardware design to foster a golden age of domain-specific acceleration.\nHowever, this new frontier of computing opportunities is not without pitfalls.\nAs computer architects approach unfamiliar domains, we have seen common themes\nemerge in the challenges that can hinder progress in the development of useful\nacceleration. In this work, we present the Magnificent Seven Challenges in\ndomain-specific accelerator design that can guide adventurous architects to\ncontribute meaningfully to novel application domains. Although these challenges\nappear across domains ranging from ML to genomics, we examine them through the\nlens of autonomous systems as a motivating example in this work. To that end,\nwe identify opportunities for the path forward in a successful domain-specific\naccelerator design from these challenges.\n","authors":["Sabrina M. Neuman","Brian Plancher","Vijay Janapa Reddi"],"pdf_url":"https://arxiv.org/pdf/2407.17311v1.pdf","comment":"Presented at DAC 2024"},{"id":"http://arxiv.org/abs/2407.17647v1","updated":"2024-07-24T21:24:56Z","published":"2024-07-24T21:24:56Z","title":"An Energy-Efficient Artefact Detection Accelerator on FPGAs for\n Hyper-Spectral Satellite Imagery","summary":" Hyper-Spectral Imaging (HSI) is a crucial technique for analysing remote\nsensing data acquired from Earth observation satellites. The rich spatial and\nspectral information obtained through HSI allows for better characterisation\nand exploration of the Earth's surface over traditional techniques like RGB and\nMulti-Spectral imaging on the downlinked image data at ground stations.\nSometimes, these images do not contain meaningful information due to the\npresence of clouds or other artefacts, limiting their usefulness. Transmission\nof such artefact HSI images leads to wasteful use of already scarce energy and\ntime costs required for communication. While detecting such artefacts before\ntransmitting the HSI image is desirable, the computational complexity of these\nalgorithms and the limited power budget on satellites (especially CubeSats) are\nkey constraints. This paper presents an unsupervised learning-based\nconvolutional autoencoder (CAE) model for artefact identification of acquired\nHSI images at the satellite and a deployment architecture on AMD's Zynq\nUltrascale FPGAs. The model is trained and tested on widely used HSI image\ndatasets: Indian Pines, Salinas Valley, the University of Pavia and the Kennedy\nSpace Center. 
For deployment, the model is quantised to 8-bit precision,\nfine-tuned using the Vitis-AI framework and integrated as a subordinate\naccelerator using AMD's Deep-Learning Processing Units (DPU) instance on the\nZynq device. Our tests show that the model can process each spectral band in an\nHSI image in 4 ms, 2.6x better than INT8 inference on Nvidia's Jetson platform\n& 1.27x better than SOTA artefact detectors. Our model also achieves an\nf1-score of 92.8% and FPR of 0% across the dataset, while consuming 21.52 mJ\nper HSI image, 3.6x better than INT8 Jetson inference & 7.5x better than SOTA\nartefact detectors, making it a viable architecture for deployment in CubeSats.\n","authors":["Cornell Castelino","Shashwat Khandelwal","Shanker Shreejith","Sharatchandra Varma Bogaraju"],"pdf_url":"https://arxiv.org/pdf/2407.17647v1.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2403.14606v2","updated":"2024-07-24T16:56:17Z","published":"2024-03-21T17:55:16Z","title":"The Elements of Differentiable Programming","summary":" Artificial intelligence has recently experienced remarkable advances, fueled\nby large models, vast datasets, accelerated hardware, and, last but not least,\nthe transformative power of differentiable programming. This new programming\nparadigm enables end-to-end differentiation of complex computer programs\n(including those with control flows and data structures), making gradient-based\noptimization of program parameters possible. As an emerging paradigm,\ndifferentiable programming builds upon several areas of computer science and\napplied mathematics, including automatic differentiation, graphical models,\noptimization and statistics. This book presents a comprehensive review of the\nfundamental concepts useful for differentiable programming. We adopt two main\nperspectives, that of optimization and that of probability, with clear\nanalogies between the two. Differentiable programming is not merely the\ndifferentiation of programs, but also the thoughtful design of programs\nintended for differentiation. By making programs differentiable, we inherently\nintroduce probability distributions over their execution, providing a means to\nquantify the uncertainty associated with program outputs.\n","authors":["Mathieu Blondel","Vincent Roulet"],"pdf_url":"https://arxiv.org/pdf/2403.14606v2.pdf","comment":"Draft version 2"},{"id":"http://arxiv.org/abs/2309.01261v2","updated":"2024-07-24T03:10:07Z","published":"2023-09-03T19:54:14Z","title":"Worst-Case Input Generation for Concurrent Programs under Non-Monotone\n Resource Metrics","summary":" Worst-case input generation aims to automatically generate inputs that\nexhibit the worst-case performance of programs. It has several applications,\nand can, for example, detect vulnerabilities to denial-of-service attacks.\nHowever, it is non-trivial to generate worst-case inputs for concurrent\nprograms, particularly for resources like memory where the peak cost depends on\nhow processes are scheduled.\n This article presents the first sound worst-case input generation algorithm\nfor concurrent programs under non-monotone resource metrics like memory. The\nkey insight is to leverage resource-annotated session types and symbolic\nexecution. Session types describe communication protocols on channels in\nprocess calculi. Equipped with resource annotations, resource-annotated session\ntypes not only encode cost bounds but also indicate how many resources can be\nreused and transferred between processes. 
This information is critical for\nidentifying a worst-case execution path during symbolic execution. The\nalgorithm is sound: if it returns any input, it is guaranteed to be a valid\nworst-case input. The algorithm is also relatively complete: as long as\nresource-annotated session types are sufficiently expressive and the background\ntheory for SMT solving is decidable, a worst-case input is guaranteed to be\nreturned. A simple case study of a web server's memory usage demonstrates the\nutility of the worst-case input generation algorithm.\n","authors":["Long Pham","Jan Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2309.01261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.06757v7","updated":"2024-07-24T18:21:58Z","published":"2021-01-17T19:24:46Z","title":"Higher Order Automatic Differentiation of Higher Order Functions","summary":" We present semantic correctness proofs of automatic differentiation (AD). We\nconsider a forward-mode AD method on a higher order language with algebraic\ndata types, and we characterise it as the unique structure preserving macro\ngiven a choice of derivatives for basic operations. We describe a rich\nsemantics for differentiable programming, based on diffeological spaces. We\nshow that it interprets our language, and we phrase what it means for the AD\nmethod to be correct with respect to this semantics. We show that our\ncharacterisation of AD gives rise to an elegant semantic proof of its\ncorrectness based on a gluing construction on diffeological spaces. We explain\nhow this is, in essence, a logical relations argument. Throughout, we show how\nthe analysis extends to AD methods for computing higher order derivatives using\na Taylor approximation.\n","authors":["Mathieu Huot","Sam Staton","Matthijs Vákár"],"pdf_url":"https://arxiv.org/pdf/2101.06757v7.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2001.02209"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2407.17207v1","updated":"2024-07-24T12:06:37Z","published":"2024-07-24T12:06:37Z","title":"Solving The Travelling Salesman Problem Using A Single Qubit","summary":" The travelling salesman problem (TSP) is a popular NP-hard-combinatorial\noptimization problem that requires finding the optimal way for a salesman to\ntravel through different cities once and return to the initial city. The\nexisting methods of solving TSPs on quantum systems are either gate-based or\nbinary variable-based encoding. Both approaches are resource-expensive in terms\nof the number of qubits while performing worse compared to existing classical\nalgorithms even for small-size problems. We present an algorithm that solves an\narbitrary TSP using a single qubit by invoking the principle of quantum\nparallelism. The cities are represented as quantum states on the Bloch sphere\nwhile the preparation of superposition states allows us to traverse multiple\npaths at once. The underlying framework of our algorithm is a quantum version\nof the classical Brachistochrone approach. Optimal control methods are employed\nto create a selective superposition of the quantum states to find the shortest\nroute of a given TSP. The numerical simulations solve a sample of four to nine\ncities for which exact solutions are obtained. The algorithm can be implemented\non any quantum platform capable of efficiently rotating a qubit and allowing\nstate tomography measurements. 
For the TSP problem sizes considered in this\nwork, our algorithm is more resource-efficient and accurate than existing\nquantum algorithms with the potential for scalability. A potential polynomial\nspeed-up over classical algorithms is discussed.\n","authors":["Kapil Goswami","Gagan Anekonda Veereshi","Peter Schmelcher","Rick Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2407.17207v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2203.15260v2","updated":"2024-07-24T11:21:47Z","published":"2022-03-29T06:15:54Z","title":"Efficient Convex Optimization Requires Superlinear Memory","summary":" We show that any memory-constrained, first-order algorithm which minimizes\n$d$-dimensional, $1$-Lipschitz convex functions over the unit ball to\n$1/\\mathrm{poly}(d)$ accuracy using at most $d^{1.25 - \\delta}$ bits of memory\nmust make at least $\\tilde{\\Omega}(d^{1 + (4/3)\\delta})$ first-order queries\n(for any constant $\\delta \\in [0, 1/4]$). Consequently, the performance of such\nmemory-constrained algorithms is a polynomial factor worse than the optimal\n$\\tilde{O}(d)$ query bound for this problem obtained by cutting plane methods\nthat use $\\tilde{O}(d^2)$ memory. This resolves a COLT 2019 open problem of\nWoodworth and Srebro.\n","authors":["Annie Marsden","Vatsal Sharan","Aaron Sidford","Gregory Valiant"],"pdf_url":"https://arxiv.org/pdf/2203.15260v2.pdf","comment":"33 pages, 1 figure"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2407.17623v1","updated":"2024-07-24T20:29:52Z","published":"2024-07-24T20:29:52Z","title":"SAfEPaTh: A System-Level Approach for Efficient Power and Thermal\n Estimation of Convolutional Neural Network Accelerator","summary":" The design of energy-efficient, high-performance, and reliable Convolutional\nNeural Network (CNN) accelerators involves significant challenges due to\ncomplex power and thermal management issues. This paper introduces SAfEPaTh, a\nnovel system-level approach for accurately estimating power and temperature in\ntile-based CNN accelerators. By addressing both steady-state and\ntransient-state scenarios, SAfEPaTh effectively captures the dynamic effects of\npipeline bubbles in interlayer pipelines, utilizing real CNN workloads for\ncomprehensive evaluation. Unlike traditional methods, it eliminates the need\nfor circuit-level simulations or on-chip measurements. Our methodology\nleverages TANIA, a cutting-edge hybrid digital-analog tile-based accelerator\nfeaturing analog-in-memory computing cores alongside digital cores. Through\nrigorous simulation results using the ResNet18 model, we demonstrate SAfEPaTh's\ncapability to accurately estimate power and temperature within 500 seconds,\nencompassing CNN model accelerator mapping exploration and detailed power and\nthermal estimations. This efficiency and accuracy make SAfEPaTh an invaluable\ntool for designers, enabling them to optimize performance while adhering to\nstringent power and thermal constraints. Furthermore, SAfEPaTh's adaptability\nextends its utility across various CNN models and accelerator architectures,\nunderscoring its broad applicability in the field. 
This study contributes\nsignificantly to the advancement of energy-efficient and reliable CNN\naccelerator designs, addressing critical challenges in dynamic power and\nthermal management.\n","authors":["Yukai Chen","Simei Yang","Debjyoti Bhattacharjee","Francky Catthoor","Arindam Mallik"],"pdf_url":"https://arxiv.org/pdf/2407.17623v1.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2407.17641v1","updated":"2024-07-24T21:09:22Z","published":"2024-07-24T21:09:22Z","title":"Regular language quantum states","summary":" We introduce regular language states, a family of quantum many-body states.\nThey are built from a special class of formal languages, called regular, which\nhas been thoroughly studied in the field of computer science. They can be\nunderstood as the superposition of all the words in a regular language and\nencompass physically relevant states such as the GHZ-, W- or Dicke-states. By\nleveraging the theory of regular languages, we develop a theoretical framework\nto describe them. First, we express them in terms of matrix product states,\nproviding efficient criteria to recognize them. We then develop a canonical\nform which allows us to formulate a fundamental theorem for the equivalence of\nregular language states, including under local unitary operations. We also\nexploit the theory of tensor networks to find an efficient criterion to\ndetermine when regular languages are shift-invariant.\n","authors":["Marta Florido-Llinàs","Álvaro M. Alhambra","David Pérez-García","J. Ignacio Cirac"],"pdf_url":"https://arxiv.org/pdf/2407.17641v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.17537v1","updated":"2024-07-24T08:35:50Z","published":"2024-07-24T08:35:50Z","title":"A process algebraic framework for multi-agent dynamic epistemic systems","summary":" This paper combines the classical model of labeled transition systems with\nthe epistemic model for reasoning about knowledge. The result is a unifying\nframework for modeling and analyzing multi-agent, knowledge-based, dynamic\nsystems. On the modeling side, we propose a process algebraic, agent-oriented\nspecification language that makes such a framework easy to use for practical\npurposes. On the verification side, we define a modal logic encompassing\ntemporal and epistemic operators.\n","authors":["Alessandro Aldini"],"pdf_url":"https://arxiv.org/pdf/2407.17537v1.pdf","comment":null}]},"2024-07-25T00:00:00Z":{"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2407.12736v3","updated":"2024-07-25T00:00:18Z","published":"2024-07-17T16:56:06Z","title":"CHOSEN: Compilation to Hardware Optimization Stack for Efficient Vision\n Transformer Inference","summary":" Vision Transformers (ViTs) represent a groundbreaking shift in machine\nlearning approaches to computer vision. Unlike traditional approaches, ViTs\nemploy the self-attention mechanism, which has been widely used in natural\nlanguage processing, to analyze image patches. Despite their advantages in\nmodeling visual tasks, deploying ViTs on hardware platforms, notably\nField-Programmable Gate Arrays (FPGAs), introduces considerable challenges.\nThese challenges stem primarily from the non-linear calculations and high\ncomputational and memory demands of ViTs. This paper introduces CHOSEN, a\nsoftware-hardware co-design framework to address these challenges and offer an\nautomated framework for ViT deployment on the FPGAs in order to maximize\nperformance. 
Our framework is built upon three fundamental contributions:\nmulti-kernel design to maximize the bandwidth, mainly targeting benefits of\nmulti DDR memory banks, approximate non-linear functions that exhibit minimal\naccuracy degradation, and efficient use of available logic blocks on the FPGA,\nand efficient compiler to maximize the performance and memory-efficiency of the\ncomputing kernels by presenting a novel algorithm for design space exploration\nto find optimal hardware configuration that achieves optimal throughput and\nlatency. Compared to the state-of-the-art ViT accelerators, CHOSEN achieves a\n1.5x and 1.42x improvement in the throughput on the DeiT-S and DeiT-B models.\n","authors":["Mohammad Erfan Sadeghi","Arash Fayyazi","Suhas Somashekar","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2407.12736v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19094v2","updated":"2024-07-25T15:55:15Z","published":"2024-06-27T11:22:46Z","title":"Understanding the Security Benefits and Overheads of Emerging Industry\n Solutions to DRAM Read Disturbance","summary":" We present the first rigorous security, performance, energy, and cost\nanalyses of the state-of-the-art on-DRAM-die read disturbance mitigation\nmethod, Per Row Activation Counting (PRAC), described in JEDEC DDR5\nspecification's April 2024 update. Unlike prior state-of-the-art that advises\nthe memory controller to periodically issue refresh management (RFM) commands,\nwhich provides the DRAM chip with time to perform refreshes, PRAC introduces a\nnew back-off signal. PRAC's back-off signal propagates from the DRAM chip to\nthe memory controller and forces the memory controller to 1) stop serving\nrequests and 2) issue RFM commands. As a result, RFM commands are issued when\nneeded as opposed to periodically, reducing RFM's overheads. We analyze PRAC in\nfour steps. First, we define an adversarial access pattern that represents the\nworst-case for PRAC's security. Second, we investigate PRAC's configurations\nand security implications. Our analyses show that PRAC can be configured for\nsecure operation as long as no bitflip occurs before accessing a memory\nlocation 10 times. Third, we evaluate the performance impact of PRAC and\ncompare it against prior works using Ramulator 2.0. Our analysis shows that\nwhile PRAC incurs less than 13% performance overhead for today's DRAM chips,\nits performance overheads can reach up to 94% for future DRAM chips that are\nmore vulnerable to read disturbance bitflips. Fourth, we define an availability\nadversarial access pattern that exacerbates PRAC's performance overhead to\nperform a memory performance attack, demonstrating that such an adversarial\npattern can hog up to 94% of DRAM throughput and degrade system throughput by\nup to 95%. We discuss PRAC's implications on future systems and foreshadow\nfuture research directions. To aid future research, we open-source our\nimplementations and scripts at https://github.com/CMU-SAFARI/ramulator2.\n","authors":["Oğuzhan Canpolat","A. Giray Yağlıkçı","Geraldo F. 
Oliveira","Ataberk Olgun","Oğuz Ergin","Onur Mutlu"],"pdf_url":"https://arxiv.org/pdf/2406.19094v2.pdf","comment":"To appear in DRAMSec 2024"},{"id":"http://arxiv.org/abs/2407.18209v1","updated":"2024-07-25T17:18:28Z","published":"2024-07-25T17:18:28Z","title":"SuperFlow: A Fully-Customized RTL-to-GDS Design Automation Flow for\n Adiabatic Quantum-Flux-Parametron Superconducting Circuits","summary":" Superconducting circuits, like Adiabatic Quantum-Flux-Parametron (AQFP),\noffer exceptional energy efficiency but face challenges in physical design due\nto sophisticated spacing and timing constraints. Current design tools often\nneglect the importance of constraint adherence throughout the entire design\nflow. In this paper, we propose SuperFlow, a fully-customized RTL-to-GDS design\nflow tailored for AQFP devices. SuperFlow leverages a synthesis tool based on\nCMOS technology to transform any input RTL netlist to an AQFP-based netlist.\nSubsequently, we devise a novel place-and-route procedure that simultaneously\nconsiders wirelength, timing, and routability for AQFP circuits. The process\nculminates in the generation of the AQFP circuit layout, followed by a Design\nRule Check (DRC) to identify and rectify any layout violations. Our\nexperimental results demonstrate that SuperFlow achieves 12.8% wirelength\nimprovement on average and 12.1% better timing quality compared with previous\nstate-of-the-art placers for AQFP circuits.\n","authors":["Yanyue Xie","Peiyan Dong","Geng Yuan","Zhengang Li","Masoud Zabihi","Chao Wu","Sung-En Chang","Xufeng Zhang","Xue Lin","Caiwen Ding","Nobuyuki Yoshikawa","Olivia Chen","Yanzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.18209v1.pdf","comment":"Accepted by DATE 2024"},{"id":"http://arxiv.org/abs/2407.18110v1","updated":"2024-07-25T15:18:47Z","published":"2024-07-25T15:18:47Z","title":"MapTune: Advancing ASIC Technology Mapping via Reinforcement Learning\n Guided Library Tuning","summary":" Technology mapping involves mapping logical circuits to a library of cells.\nTraditionally, the full technology library is used, leading to a large search\nspace and potential overhead. Motivated by randomly sampled technology mapping\ncase studies, we propose MapTune framework that addresses this challenge by\nutilizing reinforcement learning to make design-specific choices during cell\nselection. By learning from the environment, MapTune refines the cell selection\nprocess, resulting in a reduced search space and potentially improved mapping\nquality.\n The effectiveness of MapTune is evaluated on a wide range of benchmarks,\ndifferent technology libraries and technology mappers. The experimental results\ndemonstrate that MapTune achieves higher mapping accuracy and reducing\ndelay/area across diverse circuit designs, technology libraries and mappers.\nThe paper also discusses the Pareto-Optimal exploration and confirms the\nperpetual delay-area trade-off. Conducted on benchmark suites ISCAS 85/89,\nITC/ISCAS 99, VTR8.0 and EPFL benchmarks, the post-technology mapping and\npost-sizing quality-of-results (QoR) have been significantly improved, with\naverage Area-Delay Product (ADP) improvement of 22.54\\% among all different\nexploration settings in MapTune. 
The improvements remain consistent across\nfour different technologies (7nm, 45nm, 130nm, and 180 nm) and two different\nmappers.\n","authors":["Mingju Liu","Daniel Robinson","Yingjie Li","Cunxi Yu"],"pdf_url":"https://arxiv.org/pdf/2407.18110v1.pdf","comment":"IEEE/ACM International Conference on Computer-Aided Design (ICCAD\n '24), October 27--31, 2024"},{"id":"http://arxiv.org/abs/2407.17879v1","updated":"2024-07-25T08:47:40Z","published":"2024-07-25T08:47:40Z","title":"HG-PIPE: Vision Transformer Acceleration with Hybrid-Grained Pipeline","summary":" Vision Transformer (ViT) acceleration with field programmable gate array\n(FPGA) is promising but challenging. Existing FPGA-based ViT accelerators\nmainly rely on temporal architectures, which process different operators by\nreusing the same hardware blocks and suffer from extensive memory access\noverhead. Pipelined architectures, either coarse-grained or fine-grained,\nunroll the ViT computation spatially for memory access efficiency. However,\nthey usually suffer from significant hardware resource constraints and pipeline\nbubbles induced by the global computation dependency of ViT. In this paper, we\nintroduce HG-PIPE, a pipelined FPGA accelerator for high-throughput and\nlow-latency ViT processing. HG-PIPE features a hybrid-grained pipeline\narchitecture to reduce on-chip buffer cost and couples the computation dataflow\nand parallelism design to eliminate the pipeline bubbles. HG-PIPE further\nintroduces careful approximations to implement both linear and non-linear\noperators with abundant Lookup Tables (LUTs), thus alleviating resource\nconstraints. On a ZCU102 FPGA, HG-PIPE achieves 2.78 times better throughput\nand 2.52 times better resource efficiency than the prior-art accelerators,\ne.g., AutoViTAcc. With a VCK190 FPGA, HG-PIPE realizes end-to-end ViT\nacceleration on a single device and achieves 7118 images/s, which is 2.81 times\nfaster than a V100 GPU.\n","authors":["Qingyu Guo","Jiayong Wan","Songqiang Xu","Meng Li","Yuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.17879v1.pdf","comment":"Accepted by ICCAD 2024"},{"id":"http://arxiv.org/abs/2407.17790v1","updated":"2024-07-25T05:52:48Z","published":"2024-07-25T05:52:48Z","title":"Exploring the Limitations of Kolmogorov-Arnold Networks in\n Classification: Insights to Software Training and Hardware Implementation","summary":" Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have\nrecently gained popularity and attention due to their ability to substitute\nmulti-layer perceptrons (MLPs) in artificial intelligence (AI) with higher\naccuracy and interpretability. However, KAN assessment is still limited and\ncannot provide an in-depth analysis of a specific domain. Furthermore, no study\nhas been conducted on the implementation of KANs in hardware design, which\nwould directly demonstrate whether KANs are truly superior to MLPs in practical\napplications. As a result, in this paper, we focus on verifying KANs for\nclassification tasks, a common but significant topic in AI, using four\ndifferent types of datasets. Furthermore, the corresponding hardware\nimplementation is considered using the Vitis high-level synthesis (HLS) tool.\nTo the best of our knowledge, this is the first article to implement hardware\nfor KAN. The results indicate that KANs cannot achieve higher accuracy than\nMLPs on highly complex datasets while utilizing substantially higher hardware\nresources. 
Therefore, MLP remains an effective approach for achieving accuracy\nand efficiency in software and hardware implementation.\n","authors":["Van Duy Tran","Tran Xuan Hieu Le","Thi Diem Tran","Hoai Luan Pham","Vu Trung Duong Le","Tuan Hai Vu","Van Tinh Nguyen","Yasuhiko Nakashima"],"pdf_url":"https://arxiv.org/pdf/2407.17790v1.pdf","comment":"6 pages, 3 figures, 2 tables"}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2407.17276v2","updated":"2024-07-25T05:06:44Z","published":"2024-07-24T13:42:46Z","title":"SoK: Bridging Trust into the Blockchain. A Systematic Review on On-Chain\n Identity","summary":" The ongoing regulation of blockchain-based services and applications requires\nthe identification of users who are issuing transactions on the blockchain.\nThis systematic review explores the current status, identifies research gaps,\nand outlines future research directions for establishing trusted and\nprivacy-compliant identities on the blockchain (on-chain identity). A\nsystematic search term was applied across various scientific databases,\ncollecting 2232 potentially relevant research papers. These papers were\nnarrowed down in two methodologically executed steps to 98 and finally to 13\nrelevant sources. The relevant articles were then systematically analyzed based\non a set of screening questions. The results of the selected studies have\nprovided insightful findings on the mechanisms of on-chain identities. On-chain\nidentities are established using zero-knowledge proofs, public key\ninfrastructure/certificates, and web of trust approaches. The technologies and\narchitectures used by the authors are also highlighted. Trust has emerged as a\nkey research gap, manifesting in two ways: firstly, a gap in how to trust the\ndigital identity representation of a physical human; secondly, a gap in how to\ntrust identity providers that issue identity confirmations on-chain. Potential\nfuture research avenues are suggested to help fill the current gaps in\nestablishing trust and on-chain identities.\n","authors":["Awid Vaziry","Kaustabh Barman","Patrick Herbke"],"pdf_url":"https://arxiv.org/pdf/2407.17276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18202v1","updated":"2024-07-25T17:11:00Z","published":"2024-07-25T17:11:00Z","title":"Differentiable Quantum Architecture Search in Asynchronous Quantum\n Reinforcement Learning","summary":" The emergence of quantum reinforcement learning (QRL) is propelled by\nadvancements in quantum computing (QC) and machine learning (ML), particularly\nthrough quantum neural networks (QNN) built on variational quantum circuits\n(VQC). These advancements have proven successful in addressing sequential\ndecision-making tasks. However, constructing effective QRL models demands\nsignificant expertise due to challenges in designing quantum circuit\narchitectures, including data encoding and parameterized circuits, which\nprofoundly influence model performance. In this paper, we propose addressing\nthis challenge with differentiable quantum architecture search (DiffQAS),\nenabling trainable circuit parameters and structure weights using\ngradient-based optimization. Furthermore, we enhance training efficiency\nthrough asynchronous reinforcement learning (RL) methods facilitating parallel\ntraining. 
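To make the DiffQAS idea above concrete: structure weights are passed through a softmax and used to blend candidate sub-structures, so the architecture choice itself becomes differentiable and trains jointly with the circuit parameters. Below is a minimal sketch of that mechanism, with plain linear layers standing in for parameterized quantum circuits; the layer sizes and the three-candidate setup are illustrative assumptions, not the paper's construction.

```python
import torch
import torch.nn as nn

class DiffArchLayer(nn.Module):
    """Differentiable architecture search in miniature: candidate ops are
    blended by softmax-weighted structure parameters, so the structure
    weights receive gradients just like the op parameters do."""
    def __init__(self, dim, n_candidates=3):
        super().__init__()
        # Stand-ins for candidate parameterized quantum circuits.
        self.candidates = nn.ModuleList(
            [nn.Linear(dim, dim) for _ in range(n_candidates)])
        # Trainable structure weights (one logit per candidate).
        self.alpha = nn.Parameter(torch.zeros(n_candidates))

    def forward(self, x):
        w = torch.softmax(self.alpha, dim=0)
        return sum(wi * op(x) for wi, op in zip(w, self.candidates))

layer = DiffArchLayer(dim=4)
loss = layer(torch.randn(2, 4)).sum()
loss.backward()          # gradients reach alpha and every candidate op
print(layer.alpha.grad)
```

After training, the highest-weight candidate can be kept and the rest discarded, which is the usual way such softmax-relaxed searches are discretized.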
Through numerical simulations, we demonstrate that our proposed\nDiffQAS-QRL approach achieves performance comparable to manually-crafted\ncircuit architectures across considered environments, showcasing stability\nacross diverse scenarios. This methodology offers a pathway for designing QRL\nmodels without extensive quantum knowledge, ensuring robust performance and\nfostering broader application of QRL.\n","authors":["Samuel Yen-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18202v1.pdf","comment":"Accepted by IEEE International Conference on Quantum Computing and\n Engineering - QCE 2024"},{"id":"http://arxiv.org/abs/2407.18200v1","updated":"2024-07-25T17:09:22Z","published":"2024-07-25T17:09:22Z","title":"Sparse Incremental Aggregation in Multi-Hop Federated Learning","summary":" This paper investigates federated learning (FL) in a multi-hop communication\nsetup, such as in constellations with inter-satellite links. In this setup,\npart of the FL clients are responsible for forwarding other clients' results to\nthe parameter server. Instead of using conventional routing, the communication\nefficiency can be improved significantly by using in-network model aggregation\nat each intermediate hop, known as incremental aggregation (IA). Prior works\n[1] have indicated diminishing gains for IA under gradient sparsification. Here\nwe study this issue and propose several novel correlated sparsification methods\nfor IA. Numerical results show that, for some of these algorithms, the full\npotential of IA is still available under sparsification without impairing\nconvergence. We demonstrate a 15x improvement in communication efficiency over\nconventional routing and an 11x improvement over state-of-the-art (SoA) sparse\nIA.\n","authors":["Sourav Mukherjee","Nasrin Razmi","Armin Dekorsy","Petar Popovski","Bho Matthiesen"],"pdf_url":"https://arxiv.org/pdf/2407.18200v1.pdf","comment":"This paper is accepted for the 25th IEEE International Workshop on\n Signal Processing Advances in Wireless Communications (SPAWC) conference"},{"id":"http://arxiv.org/abs/2407.18148v1","updated":"2024-07-25T15:58:56Z","published":"2024-07-25T15:58:56Z","title":"StraightLine: An End-to-End Resource-Aware Scheduler for Machine\n Learning Application Requests","summary":" The life cycle of machine learning (ML) applications consists of two stages:\nmodel development and model deployment. However, traditional ML systems (e.g.,\ntraining-specific or inference-specific systems) focus on one particular stage\nor phase of the life cycle of ML applications. These systems often aim at\noptimizing model training or accelerating model inference, and they frequently\nassume homogeneous infrastructure, which may not always reflect real-world\nscenarios that include cloud data centers, local servers, containers, and\nserverless platforms. We present StraightLine, an end-to-end resource-aware\nscheduler that schedules the optimal resources (e.g., container, virtual\nmachine, or serverless) for different ML application requests in a hybrid\ninfrastructure. The key innovation is an empirical dynamic placing algorithm\nthat intelligently places requests based on their unique characteristics (e.g.,\nrequest frequency, input data size, and data distribution). 
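To illustrate the flavor of StraightLine's dynamic placing idea described above, a request profile can be mapped to a resource tier by simple rules. The rule set, thresholds, and tier names below are hypothetical stand-ins for illustration, not the paper's learned policy.

```python
def place_request(req_per_min: float, input_mb: float) -> str:
    """Toy placement rule: map a request profile to a resource tier.
    Thresholds and tiers are invented, purely for illustration."""
    if req_per_min < 1 and input_mb < 5:
        return "serverless"        # rare, small requests: no idle cost
    if req_per_min < 50:
        return "container"         # moderate load: quick scale-out
    return "virtual_machine"       # sustained heavy load: stable throughput

assert place_request(0.2, 1.0) == "serverless"
assert place_request(200.0, 64.0) == "virtual_machine"
```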
In contrast to\nexisting ML systems, StraightLine offers end-to-end resource-aware placement,\nand can thereby significantly reduce response time and failure rate for model\ndeployment when facing different computing resources in the hybrid\ninfrastructure.\n","authors":["Cheng-Wei Ching","Boyuan Guan","Hailu Xu","Liting Hu"],"pdf_url":"https://arxiv.org/pdf/2407.18148v1.pdf","comment":"6 pages, 8 figures, to appear in AIoTC'24"},{"id":"http://arxiv.org/abs/2407.18031v1","updated":"2024-07-25T13:26:59Z","published":"2024-07-25T13:26:59Z","title":"$k$-Center Clustering in Distributed Models","summary":" The $k$-center problem is a central optimization problem with numerous\napplications for machine learning, data mining, and communication networks.\nDespite extensive study in various scenarios, it surprisingly has not been\nthoroughly explored in the traditional distributed setting, where the\ncommunication graph of a network also defines the distance metric.\n We initiate the study of the $k$-center problem in a setting where the\nunderlying metric is the graph's shortest path metric in three canonical\ndistributed settings: the LOCAL, CONGEST, and CLIQUE models. Our results\nencompass constant-factor approximation algorithms and lower bounds in these\nmodels, as well as hardness results for the bi-criteria approximation setting.\n","authors":["Leyla Biabani","Ami Paz"],"pdf_url":"https://arxiv.org/pdf/2407.18031v1.pdf","comment":"Presented in SIROCCO'24 conference"},{"id":"http://arxiv.org/abs/2405.14413v2","updated":"2024-07-25T13:00:26Z","published":"2024-05-23T10:34:21Z","title":"GeoFaaS: An Edge-to-Cloud FaaS Platform","summary":" The massive growth of mobile and IoT devices demands geographically\ndistributed computing systems for optimal performance, privacy, and\nscalability. However, existing edge-to-cloud serverless platforms lack location\nawareness, resulting in inefficient network usage and increased latency.\n In this paper, we propose GeoFaaS, a novel edge-to-cloud\nFunction-as-a-Service (FaaS) platform that leverages real-time client location\ninformation for transparent request execution on the nearest available FaaS\nnode. If needed, GeoFaaS transparently offloads requests to the cloud when edge\nresources are overloaded, thus, ensuring consistent execution without user\nintervention. GeoFaaS has a modular and decentralized architecture: building on\nthe single-node FaaS system tinyFaaS, GeoFaaS works as a stand-alone\nedge-to-cloud FaaS platform but can also integrate and act as a routing layer\nfor existing FaaS services, e.g., in the cloud. To evaluate our approach, we\nimplemented an open-source proof-of-concept prototype and studied performance\nand fault-tolerance behavior in experiments.\n","authors":["Mohammadreza Malekabbasi","Tobias Pfandzelter","Trever Schirmer","David Bermbach"],"pdf_url":"https://arxiv.org/pdf/2405.14413v2.pdf","comment":"Accepted for publication in 12th IEEE International Conference on\n Cloud Engineering (IC2E 2024)"},{"id":"http://arxiv.org/abs/2407.18004v1","updated":"2024-07-25T12:59:59Z","published":"2024-07-25T12:59:59Z","title":"Optimal Broadcast Schedules in Logarithmic Time with Applications to\n Broadcast, All-Broadcast, Reduction and All-Reduction","summary":" We give optimally fast $O(\\log p)$ time (per processor) algorithms for\ncomputing round-optimal broadcast schedules for message-passing parallel\ncomputing systems. This affirmatively answers difficult questions posed in a\nSPAA 2022 BA and a CLUSTER 2022 paper. 
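For background on the $k$-center objective in the abstract above: the classic sequential baseline is Gonzalez's greedy farthest-point algorithm, a 2-approximation for metric instances, sketched below. The paper itself studies distributed LOCAL/CONGEST/CLIQUE variants under the graph's shortest-path metric, which this sequential sketch does not model.

```python
import random

def greedy_k_center(points, k, dist):
    """Gonzalez's greedy algorithm: repeatedly add the point farthest from
    the centers chosen so far. Yields a 2-approximation for metric k-center."""
    centers = [random.choice(points)]
    while len(centers) < k:
        farthest = max(points, key=lambda p: min(dist(p, c) for c in centers))
        centers.append(farthest)
    return centers

pts = [(random.random(), random.random()) for _ in range(200)]
euclid = lambda a, b: ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5
print(greedy_k_center(pts, 5, euclid))
```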
We observe that the computed schedules\nand circulant communication graph can likewise be used for reduction,\nall-broadcast and all-reduction as well, leading to new, round-optimal\nalgorithms for these problems. These observations affirmatively answer open\nquestions posed in a CLUSTER 2023 paper.\n The problem is to broadcast $n$ indivisible blocks of data from a given root\nprocessor to all other processors in a (subgraph of a) fully connected network\nof $p$ processors with fully bidirectional, one-ported communication\ncapabilities. In this model, $n-1+\\lceil\\log_2 p\\rceil$ communication rounds\nare required. Our new algorithms compute for each processor in the network\nreceive and send schedules each of size $\\lceil\\log_2 p\\rceil$ that determine\nuniquely in $O(1)$ time for each communication round the new block that the\nprocessor will receive, and the already received block it has to send. Schedule\ncomputations are done independently per processor without communication. The\nbroadcast communication subgraph is an easily computable, directed,\n$\\lceil\\log_2 p\\rceil$-regular circulant graph also used elsewhere. We show how\nthe schedule computations can be done in optimal time and space of $O(\\log p)$,\nimproving significantly over previous results of $O(p\\log^2 p)$ and $O(\\log^3\np)$, respectively. The schedule computation and broadcast algorithms are simple\nto implement, but correctness and complexity are not obvious. The schedules are\nused for new implementations of the MPI (Message-Passing Interface) collectives\nMPI_Bcast, MPI_Allgatherv, MPI_Reduce and MPI_Reduce_scatter. Preliminary\nexperimental results are given.\n","authors":["Jesper Larsson Träff"],"pdf_url":"https://arxiv.org/pdf/2407.18004v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.11236"},{"id":"http://arxiv.org/abs/2401.14351v2","updated":"2024-07-25T08:08:11Z","published":"2024-01-25T17:55:07Z","title":"ServerlessLLM: Low-Latency Serverless Inference for Large Language\n Models","summary":" This paper presents ServerlessLLM, a distributed system designed to support\nlow-latency serverless inference for Large Language Models (LLMs). By\nharnessing the substantial near-GPU storage and memory capacities of inference\nservers, ServerlessLLM achieves effective local checkpoint storage, minimizing\nthe need for remote checkpoint downloads and ensuring efficient checkpoint\nloading. The design of ServerlessLLM features three core contributions: (i)\n\\emph{fast multi-tier checkpoint loading}, featuring a new loading-optimized\ncheckpoint format and a multi-tier loading system, fully utilizing the\nbandwidth of complex storage hierarchies on GPU servers; (ii) \\emph{efficient\nlive migration of LLM inference}, which enables newly initiated inferences to\ncapitalize on local checkpoint storage while ensuring minimal user\ninterruption; and (iii) \\emph{startup-time-optimized model scheduling}, which\nassesses the locality statuses of checkpoints on each server and schedules the\nmodel onto servers that minimize the time to start the inference. 
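A rough sketch of the startup-time-optimized scheduling idea described in the ServerlessLLM abstract above: estimate the time-to-first-inference on each server from the storage tier currently holding the checkpoint, then pick the minimum. The tier bandwidths, server fields, and model name below are invented for illustration and are not the system's actual data model.

```python
def pick_server(servers, model, model_gb):
    """Pick the server minimizing estimated startup time: queueing delay
    plus checkpoint-load time from whichever tier holds the checkpoint."""
    tier_gbps = {"gpu_mem": 1e9, "host_mem": 50.0, "local_ssd": 5.0, "remote": 1.0}

    def startup_time(s):
        tier = s["checkpoints"].get(model, "remote")   # no local copy: download
        return s["queue_delay_s"] + model_gb / tier_gbps[tier]

    return min(servers, key=startup_time)

servers = [
    {"name": "a", "checkpoints": {"llm-7b": "local_ssd"}, "queue_delay_s": 0.0},
    {"name": "b", "checkpoints": {"llm-7b": "host_mem"}, "queue_delay_s": 0.5},
]
print(pick_server(servers, "llm-7b", model_gb=13.0)["name"])  # "b": 0.76s vs 2.6s
```

Note how locality can outweigh queueing delay: server "b" wins despite a longer queue because its checkpoint sits in a faster tier.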
Comprehensive\nevaluations, including microbenchmarks and real-world scenarios, demonstrate\nthat ServerlessLLM dramatically outperforms state-of-the-art serverless\nsystems, reducing latency by 10 - 200X across various LLM inference workloads.\n","authors":["Yao Fu","Leyang Xue","Yeqi Huang","Andrei-Octavian Brabete","Dmitrii Ustiugov","Yuvraj Patel","Luo Mai"],"pdf_url":"https://arxiv.org/pdf/2401.14351v2.pdf","comment":"18th USENIX Symposium on Operating Systems Design and Implementation"},{"id":"http://arxiv.org/abs/2404.09302v2","updated":"2024-07-25T06:05:54Z","published":"2024-04-14T16:57:41Z","title":"High Significant Fault Detection in Azure Core Workload Insights","summary":" Azure Core workload insights have time-series data with different metric\nunits. Faults or Anomalies are observed in these time-series data owing to\nfaults observed with respect to metric name, resources region, dimensions, and\nits dimension value associated with the data. For Azure Core, an important task\nis to highlight faults or anomalies to the user on a dashboard that they can\nperceive easily. The number of anomalies reported should be highly significant\nand in a limited number, e.g., 5-20 anomalies reported per hour. The reported\nanomalies will have significant user perception and high reconstruction error\nin any time-series forecasting model. Hence, our task is to automatically\nidentify 'high significant anomalies' and their associated information for user\nperception.\n","authors":["Pranay Lohia","Laurent Boue","Sharath Rangappa","Vijay Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2404.09302v2.pdf","comment":"Published in IAAI 2024, which is the Industrial track of AAAI 2024"},{"id":"http://arxiv.org/abs/2407.17754v1","updated":"2024-07-25T04:09:12Z","published":"2024-07-25T04:09:12Z","title":"DualFed: Enjoying both Generalization and Personalization in Federated\n Learning via Hierarchical Representations","summary":" In personalized federated learning (PFL), it is widely recognized that\nachieving both high model generalization and effective personalization poses a\nsignificant challenge due to their conflicting nature. As a result, existing\nPFL methods can only manage a trade-off between these two objectives. This\nraises an interesting question: Is it feasible to develop a model capable of\nachieving both objectives simultaneously? Our paper presents an affirmative\nanswer, and the key lies in the observation that deep models inherently exhibit\nhierarchical architectures, which produce representations with various levels\nof generalization and personalization at different stages. A straightforward\napproach stemming from this observation is to select multiple representations\nfrom these layers and combine them to concurrently achieve generalization and\npersonalization. However, the number of candidate representations is commonly\nhuge, which makes this method infeasible due to high computational costs. To\naddress this problem, we propose DualFed, a new method that can directly yield\ndual representations corresponding to generalization and personalization,\nrespectively, thereby simplifying the optimization task. Specifically, DualFed\ninserts a personalized projection network between the encoder and classifier.\nThe pre-projection representations are able to capture generalized information\nshareable across clients, and the post-projection representations are effective\nto capture task-specific information on local clients. 
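A minimal sketch of the DualFed architecture as described above: a personalized projection network inserted between the encoder and the classifier, with the forward pass exposing both the pre-projection (generalized, shared across clients) and post-projection (personalized, kept local) representations. Layer sizes are illustrative assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class DualFedNet(nn.Module):
    """Encoder -> personalized projection -> classifier. The forward pass
    returns both the pre-projection and post-projection representations."""
    def __init__(self, in_dim=32, feat_dim=64, proj_dim=64, n_classes=10):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, 128), nn.ReLU(), nn.Linear(128, feat_dim))
        self.projection = nn.Sequential(nn.Linear(feat_dim, proj_dim), nn.ReLU())
        self.classifier = nn.Linear(proj_dim, n_classes)

    def forward(self, x):
        g = self.encoder(x)        # generalized representation (shared)
        p = self.projection(g)     # personalized representation (local)
        return self.classifier(p), g, p

logits, g, p = DualFedNet()(torch.randn(8, 32))
print(logits.shape, g.shape, p.shape)  # (8, 10) (8, 64) (8, 64)
```

In a federated round, only the encoder would be aggregated across clients, while each client keeps its own projection and classifier.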
This design minimizes\nthe mutual interference between generalization and personalization, thereby\nachieving a win-win situation. Extensive experiments show that DualFed can\noutperform other FL methods. Code is available at\nhttps://github.com/GuogangZhu/DualFed.\n","authors":["Guogang Zhu","Xuefeng Liu","Jianwei Niu","Shaojie Tang","Xinghao Wu","Jiayuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.17754v1.pdf","comment":"Accepted by ACM MultiMedia 2024"},{"id":"http://arxiv.org/abs/2407.17699v1","updated":"2024-07-25T01:46:49Z","published":"2024-07-25T01:46:49Z","title":"SOK: Blockchain for Provenance","summary":" Provenance, which traces data from its creation to manipulation, is crucial\nfor ensuring data integrity, reliability, and trustworthiness. It is valuable\nfor single-user applications, collaboration within organizations, and across\norganizations. Blockchain technology has become a popular choice for\nimplementing provenance due to its distributed, transparent, and immutable\nnature. Numerous studies on blockchain designs are specifically dedicated to\nprovenance and specialize in this area. Our goal is to provide a new\nperspective on the blockchain-based provenance field by identifying the challenges\nfaced and suggesting future research directions. In this paper, we categorize\nthe problem statement into three main research questions to investigate key\nissues comprehensively and propose a new outlook on the use of blockchains. The\nfirst focuses on challenges in non-collaborative, single-source environments,\nthe second examines implications in collaborative environments and different\ndomains such as supply chain, scientific collaboration and digital forensics,\nand the last one analyzes communication and data exchange challenges between\norganizations using different blockchains. The interconnected nature of these\nresearch questions ensures a thorough exploration of provenance requirements,\nleading to more effective and secure systems. After analyzing the requirements\nof provenance in different environments, we provide future design\nconsiderations for provenance-based blockchains, including blockchain type,\nquery mechanisms, provenance capture methods, and domain-specific\nconsiderations. We also discuss future work and possible extensions in this\nfield.\n","authors":["Asma Jodeiri Akbarfam","Hoda Maleki"],"pdf_url":"https://arxiv.org/pdf/2407.17699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17676v1","updated":"2024-07-25T00:00:46Z","published":"2024-07-25T00:00:46Z","title":"Empowering the Quantum Cloud User with QRIO","summary":" Quantum computing is moving swiftly from theoretical to practical\napplications, making it crucial to establish a significant quantum advantage.\nDespite substantial investments, access to quantum devices is still limited,\nwith users facing issues like long wait times and inefficient resource\nmanagement. Unlike the mature cloud solutions for classical computing, quantum\ncomputing lacks effective infrastructure for resource optimization. We propose\na Quantum Resource Infrastructure Orchestrator (QRIO), a state-of-the-art cloud\nresource manager built on Kubernetes that is tailored to quantum computing.\nQRIO seeks to democratize access to quantum devices by providing customizable,\nuser-friendly, open-source resource management. 
QRIO's design aims to ensure\nequitable access, optimize resource utilization, and support diverse\napplications, thereby speeding up innovation and making quantum computing more\naccessible and efficient to a broader user base. In this paper, we discuss\nQRIO's various features and evaluate its capability in several representative\nusecases.\n","authors":["Shmeelok Chakraborty","Isaac Hou","Gokul S. Ravi","Ang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.17676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15560v2","updated":"2024-07-25T21:02:34Z","published":"2024-06-21T18:00:10Z","title":"How to Rent GPUs on a Budget","summary":" The explosion in Machine Learning (ML) over the past ten years has led to a\ndramatic increase in demand for GPUs to train ML models. Because it is\nprohibitively expensive for most users to build and maintain a large GPU\ncluster, large cloud providers (Microsoft Azure, Amazon AWS, Google Cloud) have\nseen explosive growth in demand for renting cloud-based GPUs. In this\ncloud-computing paradigm, a user must specify their demand for GPUs at every\nmoment in time, and will pay for every GPU-hour they use. ML training jobs are\nknown to be parallelizable to different degrees. Given a stream of ML training\njobs, a user typically wants to minimize the mean response time across all\njobs. Here, the response time of a job denotes the time from when a job arrives\nuntil it is complete. Additionally, the user is constrained by some operating\nbudget. Specifically, in this paper the user is constrained to use no more than\n$b$ GPUs per hour, over a long-run time average. The question is how to\nminimize mean response time while meeting the budget constraint. Because\ntraining jobs receive a diminishing marginal benefit from running on additional\nGPUs, allocating too many GPUs to a single training job can dramatically\nincrease the overall cost paid by the user. Hence, an optimal rental policy\nmust balance a tradeoff between training cost and mean response time. This\npaper derives the optimal rental policy for a stream of training jobs where the\njobs have different levels of parallelizability (specified by a speedup\nfunction) and different job sizes (amounts of inherent work). We make almost no\nassumptions about the arrival process and about the job size distribution. Our\noptimal policy specifies how many GPUs to rent at every moment in time and how\nto allocate these GPUs.\n","authors":["Zhouzi Li","Benjamin Berg","Arpan Mukhopadhyay","Mor Harchol-Balter"],"pdf_url":"https://arxiv.org/pdf/2406.15560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18387v1","updated":"2024-07-25T20:42:16Z","published":"2024-07-25T20:42:16Z","title":"SCALE: Self-regulated Clustered federAted LEarning in a Homogeneous\n Environment","summary":" Federated Learning (FL) has emerged as a transformative approach for enabling\ndistributed machine learning while preserving user privacy, yet it faces\nchallenges like communication inefficiencies and reliance on centralized\ninfrastructures, leading to increased latency and costs. This paper presents a\nnovel FL methodology that overcomes these limitations by eliminating the\ndependency on edge servers, employing a server-assisted Proximity Evaluation\nfor dynamic cluster formation based on data similarity, performance indices,\nand geographical proximity. 
Our integrated approach enhances operational\nefficiency and scalability through a Hybrid Decentralized Aggregation Protocol,\nwhich merges local model training with peer-to-peer weight exchange and a\ncentralized final aggregation managed by a dynamically elected driver node,\nsignificantly curtailing global communication overhead. Additionally, the\nmethodology includes Decentralized Driver Selection, Check-pointing to reduce\nnetwork traffic, and a Health Status Verification Mechanism for system\nrobustness. Validated using the breast cancer dataset, our architecture not\nonly demonstrates a nearly tenfold reduction in communication overhead but also\nshows remarkable improvements in reducing training latency and energy\nconsumption while maintaining high learning performance, offering a scalable,\nefficient, and privacy-preserving solution for the future of federated learning\necosystems.\n","authors":["Sai Puppala","Ismail Hossain","Md Jahangir Alam","Sajedul Talukder","Zahidur Talukder","Syed Bahauddin"],"pdf_url":"https://arxiv.org/pdf/2407.18387v1.pdf","comment":"This research article got accepted in COMPSAC conference and going to\n be published to IEEE"},{"id":"http://arxiv.org/abs/2407.18386v1","updated":"2024-07-25T20:40:43Z","published":"2024-07-25T20:40:43Z","title":"Leveraging Core and Uncore Frequency Scaling for Power-Efficient\n Serverless Workflows","summary":" Serverless workflows have emerged in FaaS platforms to represent the\noperational structure of traditional applications. With latency propagation\neffects becoming increasingly prominent, step-wise resource tuning is required\nto address the end-to-end Quality-of-Service (QoS) requirements. Modern\nprocessors' allowance for fine-grained Dynamic Voltage and Frequency Scaling\n(DVFS), coupled with the intermittent nature of serverless workflows presents a\nunique opportunity to reduce power while meeting QoS.\n In this paper, we introduce a QoS-aware DVFS framework for serverless\nworkflows. {\\Omega}kypous regulates the end-to-end latency of serverless\nworkflows by supplying the system with the Core/Uncore frequency combination\nthat minimizes power consumption. With Uncore DVFS enriching the efficient\npower configurations space, we devise a grey-box model that accurately projects\nfunctions' execution latency and power, to the applied Core and Uncore\nfrequency combination. To the best of our knowledge, {\\Omega}kypous is the\nfirst work that leverages Core and Uncore DVFS as an integral part of\nserverless workflows. Our evaluation on the analyzed Azure Trace, against\nstate-of-the-art (SotA) power managers, demonstrates an average power\nconsumption reduction of 9% (up to 21%) while minimizing QoS violations.\n","authors":["Achilleas Tzenetopoulos","Dimosthenis Masouros","Sotirios Xydis","Dimitrios Soudris"],"pdf_url":"https://arxiv.org/pdf/2407.18386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18365v1","updated":"2024-07-25T20:02:57Z","published":"2024-07-25T20:02:57Z","title":"FADAS: Towards Federated Adaptive Asynchronous Optimization","summary":" Federated learning (FL) has emerged as a widely adopted training paradigm for\nprivacy-preserving machine learning. While the SGD-based FL algorithms have\ndemonstrated considerable success in the past, there is a growing trend towards\nadopting adaptive federated optimization methods, particularly for training\nlarge-scale models. 
However, the conventional synchronous aggregation design\nposes a significant challenge to the practical deployment of those adaptive\nfederated optimization methods, particularly in the presence of straggler\nclients. To fill this research gap, this paper introduces federated adaptive\nasynchronous optimization, named FADAS, a novel method that incorporates\nasynchronous updates into adaptive federated optimization with provable\nguarantees. To further enhance the efficiency and resilience of our proposed\nmethod in scenarios with significant asynchronous delays, we also extend FADAS\nwith a delay-adaptive learning adjustment strategy. We rigorously establish the\nconvergence rate of the proposed algorithms and empirical results demonstrate\nthe superior performance of FADAS over other asynchronous FL baselines.\n","authors":["Yujia Wang","Shiqiang Wang","Songtao Lu","Jinghui Chen"],"pdf_url":"https://arxiv.org/pdf/2407.18365v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2407.18358v1","updated":"2024-07-25T19:43:49Z","published":"2024-07-25T19:43:49Z","title":"Generative AI like ChatGPT in Blockchain Federated Learning: use cases,\n opportunities and future","summary":" Federated learning has become a significant approach for training machine\nlearning models using decentralized data without necessitating the sharing of\nthis data. Recently, the incorporation of generative artificial intelligence\n(AI) methods has provided new possibilities for improving privacy, augmenting\ndata, and customizing models. This research explores potential integrations of\ngenerative AI in federated learning, revealing various opportunities to enhance\nprivacy, data efficiency, and model performance. It particularly emphasizes the\nimportance of generative models like generative adversarial networks (GANs) and\nvariational autoencoders (VAEs) in creating synthetic data that replicates the\ndistribution of real data. Generating synthetic data helps federated learning\naddress challenges related to limited data availability and supports robust\nmodel development. Additionally, we examine various applications of generative\nAI in federated learning that enable more personalized solutions.\n","authors":["Sai Puppala","Ismail Hossain","Md Jahangir Alam","Sajedul Talukder","Jannatul Ferdaus","Mahedi Hasan","Sameera Pisupati","Shanmukh Mathukumilli"],"pdf_url":"https://arxiv.org/pdf/2407.18358v1.pdf","comment":"We are going to submit this research article into a conference which\n is best fit for this topic"},{"id":"http://arxiv.org/abs/2407.18352v1","updated":"2024-07-25T19:38:52Z","published":"2024-07-25T19:38:52Z","title":"HPAC-ML: A Programming Model for Embedding ML Surrogates in Scientific\n Applications","summary":" The end of Dennard scaling and the slowdown of Moore's Law led to\nheterogeneous architectures benefiting machine learning (ML) algorithms. These\nhardware advancements and the development of intuitive domain-specific\nlanguages have made ML more accessible, democratizing innovation. ML models\nsurpass traditional approximation limits, broadening opportunities and evolving\nfrom statistical to complex function modeling. Consequently, scientific\napplications leverage ML models for enhanced execution speeds. 
However,\nintegrating ML models remains manual and complex, slowing the adoption of ML as\nan approximation technique in modern applications.\n We propose an easy-to-use directive-based programming model that enables\ndevelopers to describe the use of ML models in scientific applications. The\nruntime support, as instructed by the programming model, performs data\nassimilation using the original algorithm and can replace the algorithm with\nmodel inference. Our evaluation across five benchmarks, testing over 5000 ML\nmodels, shows up to 83.6x speed improvements with minimal accuracy loss (as low\nas 0.01 RMSE).\n","authors":["Zane Fink","Konstantinos Parasyris","Praneet Rathi","Giorgis Georgakoudis","Harshitha Menon","Peer-Timo Bremer"],"pdf_url":"https://arxiv.org/pdf/2407.18352v1.pdf","comment":"16 pages, 9 figures. Accepted at SC24"}],"Operation Systems":[{"id":"http://arxiv.org/abs/2405.00078v2","updated":"2024-07-25T08:21:42Z","published":"2024-04-30T12:34:23Z","title":"VeriFence: Lightweight and Precise Spectre Defenses for Untrusted Linux\n Kernel Extensions","summary":" High-performance IO demands low-overhead communication between user- and\nkernel space. This demand can no longer be fulfilled by traditional system\ncalls. Linux's extended Berkeley Packet Filter (BPF) avoids user-/kernel\ntransitions by just-in-time compiling user-provided bytecode and executing it\nin kernel mode with near-native speed. To still isolate BPF programs from the\nkernel, they are statically analyzed for memory- and type-safety, which imposes\nsome restrictions but allows for good expressiveness and high performance.\nHowever, to mitigate the Spectre vulnerabilities disclosed in 2018, defenses\nwhich reject potentially-dangerous programs had to be deployed. We find that\nthis affects 31% to 54% of programs in a dataset with 844 real-world BPF\nprograms from popular open-source projects. To solve this, users are forced to\ndisable the defenses to continue using the programs, which puts the entire\nsystem at risk.\n To enable secure and expressive untrusted Linux kernel extensions, we propose\nVeriFence, an enhancement to the kernel's Spectre defenses that reduces the\nnumber of BPF application programs rejected from 54% to zero. We measure\nVeriFence's overhead for all mainstream performance-sensitive applications of\nBPF (i.e., event tracing, profiling, and packet processing) and find that it\nimproves significantly upon the status-quo where affected BPF programs are\neither unusable or enable transient execution attacks on the kernel.\n","authors":["Luis Gerhorst","Henriette Herzog","Peter Wägemann","Maximilian Ott","Rüdiger Kapitza","Timo Hönig"],"pdf_url":"https://arxiv.org/pdf/2405.00078v2.pdf","comment":"RAID'24"},{"id":"http://arxiv.org/abs/2407.18431v1","updated":"2024-07-25T23:46:27Z","published":"2024-07-25T23:46:27Z","title":"Rusty Linux: Advances in Rust for Linux Kernel Development","summary":" Context: The integration of Rust into kernel development is a transformative\nendeavor aimed at enhancing system security and reliability by leveraging\nRust's strong memory safety guarantees. Objective: We aim to find the current\nadvances in using Rust in Kernel development to reduce the number of memory\nsafety vulnerabilities in one of the most critical pieces of software that\nunderpins all modern applications. 
Method: By analyzing a broad spectrum of\nstudies, we identify the advantages Rust offers, highlight the challenges\nfaced, and emphasize the need for community consensus on Rust's adoption.\nResults: Our findings suggest that while the initial implementations of Rust in\nthe kernel show promising results in terms of safety and stability, significant\nchallenges remain. These challenges include achieving seamless interoperability\nwith existing kernel components, maintaining performance, and ensuring adequate\nsupport and tooling for developers. Conclusions: This study underscores the\nneed for continued research and practical implementation efforts to fully\nrealize the benefits of Rust. By addressing these challenges, the integration\nof Rust could mark a significant step forward in the evolution of operating\nsystem development towards safer and more reliable systems\n","authors":["Shane K. Panter","Nasir U. Eisty"],"pdf_url":"https://arxiv.org/pdf/2407.18431v1.pdf","comment":"This paper has been accepted for publication and presentation at ESEM\n 2024 Emerging Results, Vision and Reflection Papers Track to be held in\n Barcelona, Spain on October 24-25, 2024"},{"id":"http://arxiv.org/abs/2407.18306v1","updated":"2024-07-25T18:00:08Z","published":"2024-07-25T18:00:08Z","title":"Design and demonstration of an operating system for executing\n applications on quantum network nodes","summary":" The goal of future quantum networks is to enable new internet applications\nthat are impossible to achieve using solely classical communication. Up to now,\ndemonstrations of quantum network applications and functionalities on quantum\nprocessors have been performed in ad-hoc software that was specific to the\nexperimental setup, programmed to perform one single task (the application\nexperiment) directly into low-level control devices using expertise in\nexperimental physics. Here, we report on the design and implementation of the\nfirst architecture capable of executing quantum network applications on quantum\nprocessors in platform-independent high-level software. We demonstrate the\narchitecture's capability to execute applications in high-level software, by\nimplementing it as a quantum network operating system -- QNodeOS -- and\nexecuting test programs including a delegated computation from a client to a\nserver on two quantum network nodes based on nitrogen-vacancy (NV) centers in\ndiamond. We show how our architecture allows us to maximize the use of quantum\nnetwork hardware, by multitasking different applications on a quantum network\nfor the first time. Our architecture can be used to execute programs on any\nquantum processor platform corresponding to our system model, which we\nillustrate by demonstrating an additional driver for QNodeOS for a trapped-ion\nquantum network node based on a single $^{40}\\text{Ca}^+$ atom. 
Our\narchitecture lays the groundwork for computer science research in the domain of\nquantum network programming, and paves the way for the development of software\nthat can bring quantum network technology to society.\n","authors":["Carlo Delle Donne","Mariagrazia Iuliano","Bart van der Vecht","Guilherme Maciel Ferreira","Hana Jirovská","Thom van der Steenhoven","Axel Dahlberg","Matt Skrzypczyk","Dario Fioretto","Markus Teller","Pavel Filippov","Alejandro Rodríguez-Pardo Montblanch","Julius Fischer","Benjamin van Ommen","Nicolas Demetriou","Dominik Leichtle","Luka Music","Harold Ollivier","Ingmar te Raa","Wojciech Kozlowski","Tim Taminiau","Przemysław Pawełczak","Tracy Northup","Ronald Hanson","Stephanie Wehner"],"pdf_url":"https://arxiv.org/pdf/2407.18306v1.pdf","comment":"12 pages, 5 figures, supplementary materials (48 pages, 24 figures,\n 11 tables)"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2407.18220v1","updated":"2024-07-25T17:36:18Z","published":"2024-07-25T17:36:18Z","title":"Detecting and explaining (in)equivalence of context-free grammars","summary":" We propose a scalable framework for deciding, proving, and explaining\n(in)equivalence of context-free grammars. We present an implementation of the\nframework and evaluate it on large data sets collected within educational\nsupport systems. Even though the equivalence problem for context-free languages\nis undecidable in general, the framework is able to handle a large portion of\nthese datasets. It introduces and combines techniques from several areas, such\nas an abstract grammar transformation language to identify equivalent grammars\nas well as sufficiently similar inequivalent grammars, theory-based comparison\nalgorithms for a large class of context-free languages, and a\ngraph-theory-inspired grammar canonization that allows us to efficiently identify\nisomorphic grammars.\n","authors":["Marko Schmellenkamp","Thomas Zeume","Sven Argo","Sandra Kiefer","Cedric Siems","Fynn Stebel"],"pdf_url":"https://arxiv.org/pdf/2407.18220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17853v1","updated":"2024-07-25T08:14:33Z","published":"2024-07-25T08:14:33Z","title":"Compilation of Commit Changes within Java Source Code Repositories","summary":" Java applications include third-party dependencies as bytecode. To keep these\napplications secure, researchers have proposed tools to re-identify\ndependencies that contain known vulnerabilities. Yet, to allow such\nre-identification, one must first obtain, for each vulnerability patch, the bytecode\nfixing the respective vulnerability. Such patches for dependencies are\ncurated in databases in the form of fix-commits. But fix-commits are in source\ncode, and automatically compiling whole Java projects to bytecode is\nnotoriously hard, particularly for non-current versions of the code. In this\npaper, we thus propose JESS, an approach that largely avoids this problem by\ncompiling solely the relevant code that was modified within a given commit.\nJESS reduces the code, retaining only those parts that the committed change\nreferences. To avoid name-resolution errors, JESS automatically infers stubs\nfor references to entities that are unavailable to the compiler. A challenge here is\nthat, to facilitate the above-mentioned re-identification, JESS must seek\nto produce bytecode that is almost identical to the bytecode which one would\nobtain by a successful compilation of the full project. 
An evaluation on 347\nGitHub projects shows that JESS is able to compile, in isolation, 72% of\nmethods and constructors, of which 89% have bytecode equal to the original one.\nFurthermore, on the Project KB database of fix-commits, in which only 8% of\nfiles modified within the commits can be compiled with the provided build\nscripts, JESS is able to compile 73% of all files that these commits modify.\n","authors":["Stefan Schott","Wolfram Fischer","Serena Elisa Ponta","Jonas Klauke","Eric Bodden"],"pdf_url":"https://arxiv.org/pdf/2407.17853v1.pdf","comment":"To be published in: ICSME 2024 Proceedings"},{"id":"http://arxiv.org/abs/2307.00146v3","updated":"2024-07-25T00:24:11Z","published":"2023-06-30T21:34:52Z","title":"Bluefish: A Relational Framework for Graphic Representations","summary":" Diagrams are essential tools for problem-solving and communication as they\nexternalize conceptual structures using spatial relationships. But when picking\na diagramming framework, users are faced with a dilemma. They can either use a\nhighly expressive but low-level toolkit, whose API does not match their\ndomain-specific concepts, or select a high-level typology, which offers a\nrecognizable vocabulary but supports a limited range of diagrams. To address\nthis gap, we introduce Bluefish: a diagramming framework inspired by\ncomponent-based user interface (UI) libraries. Bluefish lets users create\ndiagrams using relations: declarative, composable, and extensible diagram\nfragments that relax the concept of a UI component. Unlike a component, a\nrelation does not have sole ownership over its children nor does it need to\nfully specify their layout. To render diagrams, Bluefish extends a traditional\ntree-based scenegraph to a compound graph that captures both hierarchical and\nadjacent relationships between nodes. To evaluate our system, we construct a\ndiverse example gallery covering many domains including mathematics, physics,\ncomputer science, and even cooking. We show that Bluefish's relations are\neffective declarative primitives for diagrams. Bluefish is open source, and we\naim to shape it into both a usable tool and a research platform.\n","authors":["Josh Pollock","Catherine Mei","Grace Huang","Elliot Evans","Daniel Jackson","Arvind Satyanarayan"],"pdf_url":"https://arxiv.org/pdf/2307.00146v3.pdf","comment":"27 pages, 14 figures"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2402.05981v2","updated":"2024-07-25T13:37:16Z","published":"2024-02-08T08:02:57Z","title":"Anatomizing Deep Learning Inference in Web Browsers","summary":" Web applications have increasingly adopted Deep Learning (DL) through\nin-browser inference, wherein DL inference performs directly within Web\nbrowsers. The actual performance of in-browser inference and its impacts on the\nquality of experience (QoE) remain unexplored, and urgently require new QoE\nmeasurements beyond traditional ones, e.g., mainly focusing on page load time.\nTo bridge this gap, we make the first comprehensive performance measurement of\nin-browser inference to date. Our approach proposes new metrics to measure\nin-browser inference: responsiveness, smoothness, and inference accuracy. Our\nextensive analysis involves 9 representative DL models across Web browsers of\n50 popular PC devices and 20 mobile devices. The results reveal that in-browser\ninference exhibits a substantial latency gap, averaging 16.9 times slower on\nCPU and 4.9 times slower on GPU compared to native inference on PC devices. 
The\ngap on mobile CPU and mobile GPU is 15.8 times and 7.8 times, respectively.\nFurthermore, we identify contributing factors to such latency gap, including\nunderutilized hardware instruction sets, inherent overhead in the runtime\nenvironment, resource contention within the browser, and inefficiencies in\nsoftware libraries and GPU abstractions. Additionally, in-browser inference\nimposes significant memory demands, at times exceeding 334.6 times the size of\nthe DL models themselves, partly attributable to suboptimal memory management.\nWe also observe that in-browser inference leads to a significant 67.2% increase\nin the time it takes for GUI components to render within Web browsers,\nsignificantly affecting the overall user QoE of Web applications reliant on\nthis technology\n","authors":["Qipeng Wang","Shiqi Jiang","Zhenpeng Chen","Xu Cao","Yuanchun Li","Aoyu Li","Yun Ma","Ting Cao","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2402.05981v2.pdf","comment":"Accepted by ACM Transactions on Software Engineering and Methodology\n (TOSEM)"},{"id":"http://arxiv.org/abs/2402.02750v2","updated":"2024-07-25T09:16:05Z","published":"2024-02-05T06:06:47Z","title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache","summary":" Efficiently serving large language models (LLMs) requires batching of many\nrequests to reduce the cost per request. Yet, with larger batch sizes and\nlonger context lengths, the key-value (KV) cache, which stores attention keys\nand values to avoid re-computations, significantly increases memory demands and\nbecomes the new bottleneck in speed and memory usage. Additionally, the loading\nof the KV cache causes the computational core to be idle, which limits the\ninference speed. A straightforward and effective solution to reduce KV cache\nsize is quantization, which decreases the total bytes taken by KV cache.\nHowever, there is a lack of in-depth studies that explore the element\ndistribution of KV cache to understand the hardness and limitation of KV cache\nquantization. To fill the gap, we conducted a comprehensive study on the\nelement distribution in KV cache of popular LLMs. Our findings indicate that\nthe key cache should be quantized per-channel, i.e., group elements along the\nchannel dimension and quantize them together. In contrast, the value cache\nshould be quantized per-token. From this analysis, we developed a tuning-free\n2bit KV cache quantization algorithm named KIVI. With hardware-friendly\nimplementation, KIVI can enable Llama, Falcon, and Mistral models to maintain\nalmost the same quality while using $\\mathbf{2.6\\times}$ less peak memory\n(including model weight). This reduction in memory usage enables up to\n$\\mathbf{4\\times}$ larger batch size, bringing $\\mathbf{2.35\\times \\sim\n3.47\\times}$ throughput on real LLM inference workload. The source code is\navailable at https://github.com/jy-yuan/KIVI.\n","authors":["Zirui Liu","Jiayi Yuan","Hongye Jin","Shaochen Zhong","Zhaozhuo Xu","Vladimir Braverman","Beidi Chen","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2402.02750v2.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2406.15560v2","updated":"2024-07-25T21:02:34Z","published":"2024-06-21T18:00:10Z","title":"How to Rent GPUs on a Budget","summary":" The explosion in Machine Learning (ML) over the past ten years has led to a\ndramatic increase in demand for GPUs to train ML models. 
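The KIVI finding above (quantize the key cache per-channel, the value cache per-token) reduces to choosing the axis along which quantization groups are formed. A small NumPy sketch of asymmetric 2-bit quantization under that convention; the group shapes follow the abstract, while the tensor sizes and helper names are illustrative.

```python
import numpy as np

def asym_quant_2bit(x, axis):
    """Asymmetric 2-bit quantization: each group (slice along `axis`) gets
    its own scale and zero point, mapping values to integers {0,1,2,3}."""
    lo = x.min(axis=axis, keepdims=True)
    hi = x.max(axis=axis, keepdims=True)
    scale = np.where(hi > lo, (hi - lo) / 3.0, 1.0)   # 4 levels for 2 bits
    q = np.clip(np.round((x - lo) / scale), 0, 3).astype(np.uint8)
    return q, scale, lo

def dequant(q, scale, zero):
    return q * scale + zero

K = np.random.randn(128, 64).astype(np.float32)  # (tokens, channels)
V = np.random.randn(128, 64).astype(np.float32)

qK, sK, zK = asym_quant_2bit(K, axis=0)  # keys: per-channel groups
qV, sV, zV = asym_quant_2bit(V, axis=1)  # values: per-token groups
print(np.abs(dequant(qK, sK, zK) - K).max())  # worst-case reconstruction error
```

Reducing over axis=0 gives one (scale, zero) pair per channel, reducing over axis=1 one pair per token, which is exactly the grouping distinction the study found to matter.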
Because it is\nprohibitively expensive for most users to build and maintain a large GPU\ncluster, large cloud providers (Microsoft Azure, Amazon AWS, Google Cloud) have\nseen explosive growth in demand for renting cloud-based GPUs. In this\ncloud-computing paradigm, a user must specify their demand for GPUs at every\nmoment in time, and will pay for every GPU-hour they use. ML training jobs are\nknown to be parallelizable to different degrees. Given a stream of ML training\njobs, a user typically wants to minimize the mean response time across all\njobs. Here, the response time of a job denotes the time from when a job arrives\nuntil it is complete. Additionally, the user is constrained by some operating\nbudget. Specifically, in this paper the user is constrained to use no more than\n$b$ GPUs per hour, over a long-run time average. The question is how to\nminimize mean response time while meeting the budget constraint. Because\ntraining jobs receive a diminishing marginal benefit from running on additional\nGPUs, allocating too many GPUs to a single training job can dramatically\nincrease the overall cost paid by the user. Hence, an optimal rental policy\nmust balance a tradeoff between training cost and mean response time. This\npaper derives the optimal rental policy for a stream of training jobs where the\njobs have different levels of parallelizability (specified by a speedup\nfunction) and different job sizes (amounts of inherent work). We make almost no\nassumptions about the arrival process and about the job size distribution. Our\noptimal policy specifies how many GPUs to rent at every moment in time and how\nto allocate these GPUs.\n","authors":["Zhouzi Li","Benjamin Berg","Arpan Mukhopadhyay","Mor Harchol-Balter"],"pdf_url":"https://arxiv.org/pdf/2406.15560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18387v1","updated":"2024-07-25T20:42:16Z","published":"2024-07-25T20:42:16Z","title":"SCALE: Self-regulated Clustered federAted LEarning in a Homogeneous\n Environment","summary":" Federated Learning (FL) has emerged as a transformative approach for enabling\ndistributed machine learning while preserving user privacy, yet it faces\nchallenges like communication inefficiencies and reliance on centralized\ninfrastructures, leading to increased latency and costs. This paper presents a\nnovel FL methodology that overcomes these limitations by eliminating the\ndependency on edge servers, employing a server-assisted Proximity Evaluation\nfor dynamic cluster formation based on data similarity, performance indices,\nand geographical proximity. Our integrated approach enhances operational\nefficiency and scalability through a Hybrid Decentralized Aggregation Protocol,\nwhich merges local model training with peer-to-peer weight exchange and a\ncentralized final aggregation managed by a dynamically elected driver node,\nsignificantly curtailing global communication overhead. Additionally, the\nmethodology includes Decentralized Driver Selection, Check-pointing to reduce\nnetwork traffic, and a Health Status Verification Mechanism for system\nrobustness. 
Validated using the breast cancer dataset, our architecture not\nonly demonstrates a nearly tenfold reduction in communication overhead but also\nshows remarkable improvements in reducing training latency and energy\nconsumption while maintaining high learning performance, offering a scalable,\nefficient, and privacy-preserving solution for the future of federated learning\necosystems.\n","authors":["Sai Puppala","Ismail Hossain","Md Jahangir Alam","Sajedul Talukder","Zahidur Talukder","Syed Bahauddin"],"pdf_url":"https://arxiv.org/pdf/2407.18387v1.pdf","comment":"This research article got accepted in COMPSAC conference and going to\n be published to IEEE"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2407.18201v1","updated":"2024-07-25T17:09:26Z","published":"2024-07-25T17:09:26Z","title":"Semi-Classical Subspaces, The No Synchronization Law, and More","summary":" This paper looks at the intersection of algorithmic information theory and\nphysics, namely quantum mechanics, thermodynamics, and black holes. We discuss\ntheorems which characterize the barrier between the quantum world and the\nclassical realm. The notion of a \"semi-classical subspace\" is introduced. The\nNo Synchronization Law is detailed, which says separate and isolated physical\nsystems evolving over time cannot have thermodynamic algorithmic entropies that\nare in synch. We look at future work involving the Kolmogorov complexity of\nblack holes.\n","authors":["Samuel Epstein"],"pdf_url":"https://arxiv.org/pdf/2407.18201v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2402.13049"},{"id":"http://arxiv.org/abs/2401.00947v3","updated":"2024-07-25T14:13:59Z","published":"2024-01-01T20:08:58Z","title":"On SAT information content, its polynomial-time solvability and fixed\n code algorithms","summary":" The amount of information in satisfiability problem (SAT) is considered. SAT\ncan be polynomial-time solvable when the solving algorithm holds an exponential\namount of information. It is also established that SAT Kolmogorov complexity is\nconstant. It is argued that the amount of information in SAT grows at least\nexponentially with the size of the input instance. The amount of information in\nSAT is compared with the amount of information in the fixed code algorithms and\ngenerated over runtime.\n","authors":["Maciej Drozdowski"],"pdf_url":"https://arxiv.org/pdf/2401.00947v3.pdf","comment":"16 pages, 1 table, 0 figures, new content, rewriting arguments,\n corrected typos"},{"id":"http://arxiv.org/abs/2407.18006v1","updated":"2024-07-25T13:02:06Z","published":"2024-07-25T13:02:06Z","title":"The Existential Theory of the Reals as a Complexity Class: A Compendium","summary":" We survey the complexity class $\\exists \\mathbb{R}$, which captures the\ncomplexity of deciding the existential theory of the reals. The class $\\exists\n\\mathbb{R}$ has roots in two different traditions, one based on the\nBlum-Shub-Smale model of real computation, and the other following work by\nMn\\\"{e}v and Shor on the universality of realization spaces of oriented\nmatroids. Over the years the number of problems for which $\\exists \\mathbb{R}$\nrather than NP has turned out to be the proper way of measuring their\ncomplexity has grown, particularly in the fields of computational geometry,\ngraph drawing, game theory, and some areas in logic and algebra. 
$\\exists\n\\mathbb{R}$ has also started appearing in the context of machine learning,\nMarkov decision processes, and probabilistic reasoning.\n We have aimed at collecting a comprehensive compendium of problems complete\nand hard for $\\exists \\mathbb{R}$, as well as a long list of open problems. The\ncompendium is presented in the third part of our survey; a tour through the\ncompendium and the areas it touches on makes up the second part. The first part\nintroduces the reader to the existential theory of the reals as a complexity\nclass, discussing its history, motivation and prospects as well as some\ntechnical aspects.\n","authors":["Marcus Schaefer","Jean Cardinal","Tillmann Miltzow"],"pdf_url":"https://arxiv.org/pdf/2407.18006v1.pdf","comment":"126 pages, 12 figures, 6 tables, about 150 complete problems and\n about 50 open problems"},{"id":"http://arxiv.org/abs/2407.17947v1","updated":"2024-07-25T11:07:28Z","published":"2024-07-25T11:07:28Z","title":"Supercritical Size-Width Tree-Like Resolution Trade-Offs for Graph\n Isomorphism","summary":" We study the refutation complexity of graph isomorphism in the tree-like\nresolution calculus. Tor\\'an and W\\\"orz (TOCL 2023) showed that there is a\nresolution refutation of narrow width $k$ for two graphs if and only if they\ncan be distinguished in ($k+1$)-variable first-order logic (FO$^{k+1}$) and\nhence by a count-free variant of the $k$-dimensional Weisfeiler-Leman\nalgorithm. While DAG-like narrow width $k$ resolution refutations have size at\nmost $n^k$, tree-like refutations may be much larger. We show that there are\ngraphs of order n, whose isomorphism can be refuted in narrow width $k$ but\nonly in tree-like size $2^{\\Omega(n^{k/2})}$. This is a supercritical trade-off\nwhere bounding one parameter (the narrow width) causes the other parameter (the\nsize) to grow above its worst case. The size lower bound is super-exponential\nin the formula size and improves a related supercritical width versus tree-like\nsize trade-off by Razborov (JACM 2016). To prove our result, we develop a new\nvariant of the $k$-pebble EF-game for FO$^k$ to reason about tree-like\nrefutation size in a similar way as the Prover-Delayer games in proof\ncomplexity. We analyze this game on a modified variant of the compressed CFI\ngraphs introduced by Grohe, Lichter, Neuen, and Schweitzer (FOCS 2023). Using a\nrecent improved robust compressed CFI construction of Janett, Nordstr\\\"om, and\nPang (unpublished manuscript), we obtain a similar bound for width $k$ (instead\nof the stronger but less common narrow width) and make the result more robust.\n","authors":["Christoph Berkholz","Moritz Lichter","Harry Vinall-Smeeth"],"pdf_url":"https://arxiv.org/pdf/2407.17947v1.pdf","comment":"32 pages, 2 figures"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2407.18220v1","updated":"2024-07-25T17:36:18Z","published":"2024-07-25T17:36:18Z","title":"Detecting and explaining (in)equivalence of context-free grammars","summary":" We propose a scalable framework for deciding, proving, and explaining\n(in)equivalence of context-free grammars. We present an implementation of the\nframework and evaluate it on large data sets collected within educational\nsupport systems. Even though the equivalence problem for context-free languages\nis undecidable in general, the framework is able to handle a large portion of\nthese datasets. 
It introduces and combines techniques from several areas, such\nas an abstract grammar transformation language to identify equivalent grammars\nas well as sufficiently similar inequivalent grammars, theory-based comparison\nalgorithms for a large class of context-free languages, and a\ngraph-theory-inspired grammar canonization that allows us to efficiently identify\nisomorphic grammars.\n","authors":["Marko Schmellenkamp","Thomas Zeume","Sven Argo","Sandra Kiefer","Cedric Siems","Fynn Stebel"],"pdf_url":"https://arxiv.org/pdf/2407.18220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18131v1","updated":"2024-07-25T15:37:26Z","published":"2024-07-25T15:37:26Z","title":"Reachability for Multi-Priced Timed Automata with Positive and Negative\n Rates","summary":" Multi-priced timed automata (MPTA) are timed automata with observer\n variables whose derivatives can change from one location to another.\n Observers are write-only variables, that is, they do not affect the control\n flow of the automaton; thus MPTA lie between timed and hybrid\n automata in expressiveness. Previous work considered observers with\n non-negative slope in every location. In this paper we treat\n observers that have both positive and negative rates. Our\n main result is an algorithm to decide a gap version of the\n reachability problem for this variant of MPTA. We translate the\n gap reachability problem into a gap satisfiability problem for mixed\n integer-real systems of nonlinear constraints. Our main technical\n contribution -- a result of independent interest -- is a procedure\n to solve such constraints via a combination of branch-and-bound\n and relaxation-and-rounding.\n","authors":["Andrew Scoones","Mahsa Shirmohammadi","James Worrell"],"pdf_url":"https://arxiv.org/pdf/2407.18131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18090v1","updated":"2024-07-25T14:55:11Z","published":"2024-07-25T14:55:11Z","title":"On the Minimisation of Deterministic and History-Deterministic\n Generalised (co)Büchi Automata","summary":" We present a polynomial-time algorithm minimising the number of states of\nhistory-deterministic generalised coB\\"uchi automata, building on the work of\nAbu Radi and Kupferman on coB\\"uchi automata. On the other hand, we establish\nthat the minimisation problem for both deterministic and history-deterministic\ngeneralised B\\"uchi automata is NP-complete, as well as the problem of\nminimising at the same time the number of states and colours of\nhistory-deterministic generalised coB\\"uchi automata.\n","authors":["Antonio Casares","Olivier Idir","Denis Kuperberg","Corto Mascle","Aditya Prakash"],"pdf_url":"https://arxiv.org/pdf/2407.18090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03183v4","updated":"2024-07-25T13:42:35Z","published":"2023-04-06T16:08:38Z","title":"History-deterministic Timed Automata","summary":" We explore the notion of history-determinism in the context of timed automata\n(TA) over infinite timed words. History-deterministic (HD) automata are those\nin which nondeterminism can be resolved on the fly, based on the run\nconstructed thus far. 
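For intuition on the branch-and-bound plus relaxation combination mentioned in the MPTA abstract above, here is a generic sketch for mixed integer-real linear constraints: solve the LP relaxation, then branch on a fractional variable that is required to be integral. It omits incumbent-based pruning and does not handle the paper's nonlinear constraint systems; it only illustrates the general technique.

```python
import math
from scipy.optimize import linprog

def bnb(c, A_ub, b_ub, bounds, int_vars):
    """Minimize c @ x subject to A_ub @ x <= b_ub, with x[i] integral for
    i in int_vars: solve the LP relaxation, then branch on a fractional
    integer variable. (No incumbent-based pruning, for brevity.)"""
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds)
    if not res.success:
        return None                       # infeasible branch
    for i in int_vars:
        xi = res.x[i]
        if abs(xi - round(xi)) > 1e-6:    # fractional: split on x_i
            lo, hi = bounds[i]
            kids = []
            if math.floor(xi) >= lo:
                kids.append(bounds[:i] + [(lo, math.floor(xi))] + bounds[i+1:])
            if math.floor(xi) + 1 <= hi:
                kids.append(bounds[:i] + [(math.floor(xi) + 1, hi)] + bounds[i+1:])
            sols = [s for b in kids if (s := bnb(c, A_ub, b_ub, b, int_vars))]
            return min(sols, key=lambda s: s[0], default=None)
    return res.fun, res.x                 # all integrality constraints met

# minimize -x0 - x1  s.t.  x0 + x1 <= 3.5, both integral: optimum is -3.
print(bnb([-1, -1], [[1, 1]], [3.5], [(0, 5), (0, 5)], int_vars=[0, 1]))
```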
History-determinism is a robust property that admits\ndifferent game-based characterisations, and HD specifications allow for\ngame-based verification without an expensive determinization step.\n We show that the class of timed $\\omega$-languages recognized by HD timed\nautomata strictly extends that of deterministic ones, and is strictly included\nin those recognised by fully non-deterministic TA.\n For non-deterministic timed automata it is known that universality is already\nundecidable for safety/reachability TA. For history-deterministic TA with\narbitrary parity acceptance, we show that timed universality, inclusion, and\nsynthesis all remain decidable and are EXPTIME-complete.\n For the subclass of TA with safety or reachability acceptance, one can decide\n(in EXPTIME) whether such an automaton is history-deterministic. If so, it can be\neffectively determinized without introducing new automata states.\n","authors":["Sougata Bose","Thomas A. Henzinger","Karoliina Lehtinen","Sven Schewe","Patrick Totzke"],"pdf_url":"https://arxiv.org/pdf/2304.03183v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18006v1","updated":"2024-07-25T13:02:06Z","published":"2024-07-25T13:02:06Z","title":"The Existential Theory of the Reals as a Complexity Class: A Compendium","summary":" We survey the complexity class $\\exists \\mathbb{R}$, which captures the\ncomplexity of deciding the existential theory of the reals. The class $\\exists\n\\mathbb{R}$ has roots in two different traditions, one based on the\nBlum-Shub-Smale model of real computation, and the other following work by\nMn\\\"{e}v and Shor on the universality of realization spaces of oriented\nmatroids. Over the years the number of problems for which $\\exists \\mathbb{R}$\nrather than NP has turned out to be the proper way of measuring their\ncomplexity has grown, particularly in the fields of computational geometry,\ngraph drawing, game theory, and some areas in logic and algebra. $\\exists\n\\mathbb{R}$ has also started appearing in the context of machine learning,\nMarkov decision processes, and probabilistic reasoning.\n We have aimed at collecting a comprehensive compendium of problems complete\nand hard for $\\exists \\mathbb{R}$, as well as a long list of open problems. The\ncompendium is presented in the third part of our survey; a tour through the\ncompendium and the areas it touches on makes up the second part. The first part\nintroduces the reader to the existential theory of the reals as a complexity\nclass, discussing its history, motivation and prospects as well as some\ntechnical aspects.\n","authors":["Marcus Schaefer","Jean Cardinal","Tillmann Miltzow"],"pdf_url":"https://arxiv.org/pdf/2407.18006v1.pdf","comment":"126 pages, 12 figures, 6 tables, about 150 complete problems and\n about 50 open problems"},{"id":"http://arxiv.org/abs/2102.07401v2","updated":"2024-07-25T15:40:48Z","published":"2021-02-15T09:00:02Z","title":"Model-bounded monitoring of hybrid systems","summary":" Monitoring of hybrid systems attracts both scientific and practical\nattention. However, monitoring algorithms suffer from the methodological\ndifficulty of only observing sampled discrete-time signals, while real\nbehaviors are continuous-time signals. 
To mitigate this problem of sampling\nuncertainties, we introduce a model-bounded monitoring scheme, where we use\nprior knowledge about the target system to prune interpolation candidates.\nTechnically, we express such prior knowledge by linear hybrid automata (LHAs)\n-- the LHAs are called bounding models. We introduce a novel notion of\nmonitored language of LHAs, and we reduce the monitoring problem to the\nmembership problem of the monitored language. We present two partial algorithms\n-- one is via reduction to reachability in LHAs and the other is a direct one\nusing polyhedra -- and show that these methods, and thus the proposed\nmodel-bounded monitoring scheme, are efficient and practically relevant.\n","authors":["Masaki Waga","Étienne André","Ichiro Hasuo"],"pdf_url":"https://arxiv.org/pdf/2102.07401v2.pdf","comment":"This is the author version of the manuscript of the same name\n published in the ACM Transactions on Cyber-Physical Systems"},{"id":"http://arxiv.org/abs/1903.07328v2","updated":"2024-07-25T07:40:20Z","published":"2019-03-18T09:41:07Z","title":"Parametric Timed Pattern Matching","summary":" Given a log and a specification, timed pattern matching aims at exhibiting\nfor which start and end dates a specification holds on that log. For example,\n\"a given action is always followed by another action before a given deadline\".\nThis problem has strong connections with monitoring real-time systems. We\naddress here timed pattern matching in the presence of an uncertain\nspecification, i.e., that may contain timing parameters (e.g., the deadline can\nbe uncertain or unknown). We want to know for which start and end dates, and\nfor what values of the timing parameters, a property holds. For instance, we\nlook for the minimum or maximum deadline (together with the corresponding start\nand end dates) for which the property holds. We propose two frameworks for\nparametric timed pattern matching. The first one is based on parametric timed\nmodel checking. In contrast to most parametric timed problems, the solution is\neffectively computable. The second one is a dedicated method; not only do we\nlargely improve the efficiency compared to the first method, but we further\npropose optimizations with skipping. Our experiment results suggest that our\nalgorithms, especially the second one, are efficient and practically relevant.\n","authors":["Masaki Waga","Étienne André","Ichiro Hasuo"],"pdf_url":"https://arxiv.org/pdf/1903.07328v2.pdf","comment":"This is the author version of the manuscript of the same name\n published in ACM Transactions on Software Engineering and Methodology (Volume\n 32, Issue 1, 2023). This manuscript is an extension of [ICECCS 2018, NFM\n 2019], with [ICECCS 2018] describing the first method of this manuscript\n (based on parametric timed model checking) while [NFM 2019] describes the\n second dedicated method. arXiv admin note: substantial text overlap with\n arXiv:1812.08940"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2407.18069v1","updated":"2024-07-25T14:24:57Z","published":"2024-07-25T14:24:57Z","title":"C2P: Featuring Large Language Models with Causal Reasoning","summary":" Causal reasoning is the primary bottleneck that Large Language Models (LLMs)\nmust overcome to attain human-level intelligence. To address this, we introduce\nthe Causal Chain of Prompting (C2P) as the first reasoning framework that\nequips current LLMs with causal reasoning capabilities. 
C2P operates\nautonomously, avoiding reliance on external tools or modules during both the\ncausal learning and reasoning phases, and can be seamlessly implemented during\nthe training or fine-tuning of LLMs. Experimental results across various\nbenchmark datasets demonstrate a significant improvement in causal learning and\nsubsequent reasoning accuracy of LLMs. We illustrate how C2P enhances LLMs'\nability to causally reason in real-world scenarios, addressing complex problems\nin fields such as healthcare, medicine, economics, education, social sciences,\nenvironmental science, and marketing. With few-shot learning, GPT-4 Turbo using\nC2P with as few as six examples achieves significant performance improvements,\nboasting over a 33% increase in reasoning accuracy over the most\nstate-of-the-art LLMs, which perform nearly randomly in similar circumstances.\nThis demonstrates the transformative potential of integrating C2P into LLM\ntraining or fine-tuning processes, thereby empowering these models with\nadvanced causal reasoning capabilities.\n","authors":["Abdolmahdi Bagheri","Matin Alinejad","Kevin Bello","Alireza Akhondi-Asl"],"pdf_url":"https://arxiv.org/pdf/2407.18069v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.05836 by other authors"},{"id":"http://arxiv.org/abs/2304.03183v4","updated":"2024-07-25T13:42:35Z","published":"2023-04-06T16:08:38Z","title":"History-deterministic Timed Automata","summary":" We explore the notion of history-determinism in the context of timed automata\n(TA) over infinite timed words. History-deterministic (HD) automata are those\nin which nondeterminism can be resolved on the fly, based on the run\nconstructed thus far. History-determinism is a robust property that admits\ndifferent game-based characterisations, and HD specifications allow for\ngame-based verification without an expensive determinization step.\n We show that the class of timed $\\omega$-languages recognized by HD timed\nautomata strictly extends that of deterministic ones, and is strictly included\nin those recognised by fully non-deterministic TA.\n For non-deterministic timed automata it is known that universality is already\nundecidable for safety/reachability TA. For history-deterministic TA with\narbitrary parity acceptance, we show that timed universality, inclusion, and\nsynthesis all remain decidable and are EXPTIME-complete.\n For the subclass of TA with safety or reachability acceptance, one can decide\n(in EXPTIME) whether such an automaton is history-deterministic. If so, it can be\neffectively determinized without introducing new automata states.\n","authors":["Sougata Bose","Thomas A. Henzinger","Karoliina Lehtinen","Sven Schewe","Patrick Totzke"],"pdf_url":"https://arxiv.org/pdf/2304.03183v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18006v1","updated":"2024-07-25T13:02:06Z","published":"2024-07-25T13:02:06Z","title":"The Existential Theory of the Reals as a Complexity Class: A Compendium","summary":" We survey the complexity class $\\exists \\mathbb{R}$, which captures the\ncomplexity of deciding the existential theory of the reals. The class $\\exists\n\\mathbb{R}$ has roots in two different traditions, one based on the\nBlum-Shub-Smale model of real computation, and the other following work by\nMn\\\"{e}v and Shor on the universality of realization spaces of oriented\nmatroids. 
Over the years the number of problems for which $\\exists \\mathbb{R}$\nrather than NP has turned out to be the proper way of measuring their\ncomplexity has grown, particularly in the fields of computational geometry,\ngraph drawing, game theory, and some areas in logic and algebra. $\\exists\n\\mathbb{R}$ has also started appearing in the context of machine learning,\nMarkov decision processes, and probabilistic reasoning.\n We have aimed at collecting a comprehensive compendium of problems complete\nand hard for $\\exists \\mathbb{R}$, as well as a long list of open problems. The\ncompendium is presented in the third part of our survey; a tour through the\ncompendium and the areas it touches on makes up the second part. The first part\nintroduces the reader to the existential theory of the reals as a complexity\nclass, discussing its history, motivation and prospects as well as some\ntechnical aspects.\n","authors":["Marcus Schaefer","Jean Cardinal","Tillmann Miltzow"],"pdf_url":"https://arxiv.org/pdf/2407.18006v1.pdf","comment":"126 pages, 12 figures, 6 tables, about 150 complete problems and\n about 50 open problems"},{"id":"http://arxiv.org/abs/2407.17951v1","updated":"2024-07-25T11:15:57Z","published":"2024-07-25T11:15:57Z","title":"Pruning Boolean d-DNNF Circuits Through Tseitin-Awareness","summary":" Boolean circuits in d-DNNF form enable tractable probabilistic inference.\nHowever, as a key insight of this work, we show that commonly used d-DNNF\ncompilation approaches introduce irrelevant subcircuits. We call these\nsubcircuits Tseitin artifacts, as they are introduced due to the Tseitin\ntransformation step -- a well-established procedure to transform any circuit\ninto the CNF format required by several d-DNNF knowledge compilers. We discuss\nhow to detect and remove both Tseitin variables and Tseitin artifacts, leading\nto more succinct circuits. We empirically observe an average size reduction of\n77.5% when removing both Tseitin variables and artifacts. The additional\npruning of Tseitin artifacts reduces the size by 22.2% on average. This\nsignificantly improves downstream tasks that benefit from a more succinct\ncircuit, e.g., probabilistic inference tasks.\n","authors":["Vincent Derkinderen"],"pdf_url":"https://arxiv.org/pdf/2407.17951v1.pdf","comment":"submitted to ICTAI 2024"},{"id":"http://arxiv.org/abs/2407.17947v1","updated":"2024-07-25T11:07:28Z","published":"2024-07-25T11:07:28Z","title":"Supercritical Size-Width Tree-Like Resolution Trade-Offs for Graph\n Isomorphism","summary":" We study the refutation complexity of graph isomorphism in the tree-like\nresolution calculus. Tor\\'an and W\\\"orz (TOCL 2023) showed that there is a\nresolution refutation of narrow width $k$ for two graphs if and only if they\ncan be distinguished in ($k+1$)-variable first-order logic (FO$^{k+1}$) and\nhence by a count-free variant of the $k$-dimensional Weisfeiler-Leman\nalgorithm. While DAG-like narrow width $k$ resolution refutations have size at\nmost $n^k$, tree-like refutations may be much larger. We show that there are\ngraphs of order n, whose isomorphism can be refuted in narrow width $k$ but\nonly in tree-like size $2^{\\Omega(n^{k/2})}$. This is a supercritical trade-off\nwhere bounding one parameter (the narrow width) causes the other parameter (the\nsize) to grow above its worst case. The size lower bound is super-exponential\nin the formula size and improves a related supercritical width versus tree-like\nsize trade-off by Razborov (JACM 2016). 
To prove our result, we develop a new\nvariant of the $k$-pebble EF-game for FO$^k$ to reason about tree-like\nrefutation size in a similar way as the Prover-Delayer games in proof\ncomplexity. We analyze this game on a modified variant of the compressed CFI\ngraphs introduced by Grohe, Lichter, Neuen, and Schweitzer (FOCS 2023). Using a\nrecent improved robust compressed CFI construction of Janett, Nordstr\\\"om, and\nPang (unpublished manuscript), we obtain a similar bound for width $k$ (instead\nof the stronger but less common narrow width) and make the result more robust.\n","authors":["Christoph Berkholz","Moritz Lichter","Harry Vinall-Smeeth"],"pdf_url":"https://arxiv.org/pdf/2407.17947v1.pdf","comment":"32 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.10616v5","updated":"2024-07-25T07:29:09Z","published":"2024-04-16T14:41:35Z","title":"One is all you need: Second-order Unification without First-order\n Variables","summary":" We introduce a fragment of second-order unification, referred to as\n\\emph{Second-Order Ground Unification (SOGU)}, with the following properties:\n(i) only one second-order variable is allowed, and (ii) first-order variables\ndo not occur. We study an equational variant of SOGU where the signature\ncontains \\textit{associative} binary function symbols (ASOGU) and show that\nHilbert's 10$^{th}$ problem is reducible to ASOGU unifiability, thus proving\nundecidability. Our reduction provides a new lower bound for the undecidability\nof second-order unification, as previous results required first-order variable\noccurrences, multiple second-order variables, and/or equational theories\ninvolving \\textit{length-reducing} rewrite systems. Furthermore, our reduction\nholds even in the case when associativity of the binary function symbol is\nrestricted to \\emph{power associative}, i.e. f(f(x,x),x)= f(x,f(x,x)), as our\nconstruction requires a single constant.\n","authors":["David M. Cerna","Julian Parsert"],"pdf_url":"https://arxiv.org/pdf/2404.10616v5.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2102.07401v2","updated":"2024-07-25T15:40:48Z","published":"2021-02-15T09:00:02Z","title":"Model-bounded monitoring of hybrid systems","summary":" Monitoring of hybrid systems attracts both scientific and practical\nattention. However, monitoring algorithms suffer from the methodological\ndifficulty of only observing sampled discrete-time signals, while real\nbehaviors are continuous-time signals. To mitigate this problem of sampling\nuncertainties, we introduce a model-bounded monitoring scheme, where we use\nprior knowledge about the target system to prune interpolation candidates.\nTechnically, we express such prior knowledge by linear hybrid automata (LHAs)\n-- the LHAs are called bounding models. We introduce a novel notion of\nmonitored language of LHAs, and we reduce the monitoring problem to the\nmembership problem of the monitored language. 
We present two partial algorithms\n-- one is via reduction to reachability in LHAs and the other is a direct one\nusing polyhedra -- and show that these methods, and thus the proposed\nmodel-bounded monitoring scheme, are efficient and practically relevant.\n","authors":["Masaki Waga","Étienne André","Ichiro Hasuo"],"pdf_url":"https://arxiv.org/pdf/2102.07401v2.pdf","comment":"This is the author version of the manuscript of the same name\n published in the ACM Transactions on Cyber-Physical Systems"},{"id":"http://arxiv.org/abs/2001.11906v7","updated":"2024-07-25T14:02:19Z","published":"2020-01-31T15:32:58Z","title":"Zeta Functions and the (Linear) Logic of Markov Processes","summary":" The author introduced models of linear logic known as ''Interaction Graphs''\nwhich generalise Girard's various geometry of interaction constructions. In\nthis work, we establish how these models essentially rely on a deep connection\nbetween zeta functions and the execution of programs, expressed as a cocycle.\nThis is first shown in the simple case of graphs, before being lifted to\ndynamical systems. Focussing on probabilistic models, we then explain how the\nnotion of graphings used in Interaction Graphs captures a natural class of\nsub-Markov processes. We then extend the realisability constructions and the\nnotion of zeta function to provide a realisability model of second-order linear\nlogic over the set of all (discrete-time) sub-Markov processes.\n","authors":["Thomas Seiller"],"pdf_url":"https://arxiv.org/pdf/2001.11906v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18420v1","updated":"2024-07-25T22:41:24Z","published":"2024-07-25T22:41:24Z","title":"On Polynomial-Time Decidability of k-Negations Fragments of First-Order\n Theories","summary":" This paper introduces a generic framework that provides sufficient conditions\nfor guaranteeing polynomial-time decidability of fixed-negation fragments of\nfirst-order theories that adhere to certain fixed-parameter tractability\nrequirements. It enables deciding sentences of such theories with arbitrary\nexistential quantification, conjunction and a fixed number of negation symbols\nin polynomial time. It was recently shown by Nguyen and Pak [SIAM J. Comput.\n51(2): 1--31 (2022)] that an even more restricted such fragment of Presburger\narithmetic (the first-order theory of the integers with addition and order) is\nNP-hard. In contrast, by application of our framework, we show that the fixed\nnegation fragment of weak Presburger arithmetic, which drops the order relation\nfrom Presburger arithmetic in favour of equality, is decidable in polynomial\ntime.\n","authors":["Christoph Haase","Alessio Mansutti","Amaury Pouly"],"pdf_url":"https://arxiv.org/pdf/2407.18420v1.pdf","comment":null}]},"2024-07-26T00:00:00Z":{"Hardware Architecture":[{"id":"http://arxiv.org/abs/2407.17790v2","updated":"2024-07-26T01:14:52Z","published":"2024-07-25T05:52:48Z","title":"Exploring the Limitations of Kolmogorov-Arnold Networks in\n Classification: Insights to Software Training and Hardware Implementation","summary":" Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have\nrecently gained popularity and attention due to the ability to substitute\nmulti-layer perceptrons (MLPs) in artificial intelligence (AI) with higher\naccuracy and interoperability. However, KAN assessment is still limited and\ncannot provide an in-depth analysis of a specific domain. 
Furthermore, no study\nhas been conducted on the implementation of KANs in hardware design, which\nwould directly demonstrate whether KANs are truly superior to MLPs in practical\napplications. As a result, in this paper, we focus on verifying KANs for\nclassification issues, which are a common but significant topic in AI, using\nfour different types of datasets. Furthermore, the corresponding hardware\nimplementation is considered using the Vitis high-level synthesis (HLS) tool.\nTo the best of our knowledge, this is the first article to implement hardware\nfor KAN. The results indicate that KANs cannot achieve higher accuracy than MLPs\nin highly complex datasets while utilizing substantially higher hardware\nresources. Therefore, MLP remains an effective approach for achieving accuracy\nand efficiency in software and hardware implementation.\n","authors":["Van Duy Tran","Tran Xuan Hieu Le","Thi Diem Tran","Hoai Luan Pham","Vu Trung Duong Le","Tuan Hai Vu","Van Tinh Nguyen","Yasuhiko Nakashima"],"pdf_url":"https://arxiv.org/pdf/2407.17790v2.pdf","comment":"6 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.15353v2","updated":"2024-07-26T08:36:25Z","published":"2024-07-22T03:44:27Z","title":"Customized Retrieval Augmented Generation and Benchmarking for EDA Tool\n Documentation QA","summary":" Retrieval augmented generation (RAG) enhances the accuracy and reliability of\ngenerative AI models by sourcing factual information from external databases,\nand is extensively employed in document-grounded question-answering (QA)\ntasks. Off-the-shelf RAG flows are well pretrained on general-purpose\ndocuments, yet they encounter significant challenges when being applied to\nknowledge-intensive vertical domains, such as electronic design automation\n(EDA). This paper addresses this issue by proposing a customized RAG framework\nalong with three domain-specific techniques for EDA tool documentation QA,\nincluding a contrastive learning scheme for text embedding model fine-tuning, a\nreranker distilled from a proprietary LLM, and a generative LLM fine-tuned with\na high-quality domain corpus. Furthermore, we have developed and released a\ndocumentation QA evaluation benchmark, ORD-QA, for OpenROAD, an advanced\nRTL-to-GDSII design platform. Experimental results demonstrate that our\nproposed RAG flow and techniques have achieved superior performance on ORD-QA\nas well as on a commercial tool, compared with the state of the art. The ORD-QA\nbenchmark and the training dataset for our customized RAG flow are open-source\nat https://github.com/lesliepy99/RAG-EDA.\n","authors":["Yuan Pu","Zhuolun He","Tairu Qiu","Haoyuan Wu","Bei Yu"],"pdf_url":"https://arxiv.org/pdf/2407.15353v2.pdf","comment":"Accepted by ICCAD 2024"},{"id":"http://arxiv.org/abs/2407.18499v1","updated":"2024-07-26T04:15:54Z","published":"2024-07-26T04:15:54Z","title":"Non-Overlapping Placement of Macro Cells based on Reinforcement Learning\n in Chip Design","summary":" Due to the increasing complexity of chip design, existing placement methods\nstill have many shortcomings in dealing with macro cell coverage and\noptimization efficiency. To address the problems of layout overlap, inferior\nperformance, and low optimization efficiency in existing chip design methods,\nthis paper proposes an end-to-end placement method, SRLPlacer, based on\nreinforcement learning. 
First, the placement problem is transformed into a\nMarkov decision process by establishing the coupling relationship graph model\nbetween macro cells to learn the strategy for optimizing layouts. Secondly, the\nwhole placement process is optimized after integrating the standard cell\nlayout. Evaluated on the public benchmark ISPD2005, the proposed SRLPlacer\ncan effectively solve the overlap problem between macro cells while considering\nrouting congestion and shortening the total wire length to ensure routability.\n","authors":["Tao Yu","Peng Gao","Fei Wang","Ru-Yue Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.18499v1.pdf","comment":null}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2407.18004v2","updated":"2024-07-26T09:28:51Z","published":"2024-07-25T12:59:59Z","title":"Optimal Broadcast Schedules in Logarithmic Time with Applications to\n Broadcast, All-Broadcast, Reduction and All-Reduction","summary":" We give optimally fast $O(\\log p)$ time (per processor) algorithms for\ncomputing round-optimal broadcast schedules for message-passing parallel\ncomputing systems. This affirmatively answers difficult questions posed in a\nSPAA 2022 BA and a CLUSTER 2022 paper. We observe that the computed schedules\nand circulant communication graph can likewise be used for reduction,\nall-broadcast and all-reduction as well, leading to new, round-optimal\nalgorithms for these problems. These observations affirmatively answer open\nquestions posed in a CLUSTER 2023 paper.\n The problem is to broadcast $n$ indivisible blocks of data from a given root\nprocessor to all other processors in a (subgraph of a) fully connected network\nof $p$ processors with fully bidirectional, one-ported communication\ncapabilities. In this model, $n-1+\\lceil\\log_2 p\\rceil$ communication rounds\nare required. Our new algorithms compute for each processor in the network\nreceive and send schedules each of size $\\lceil\\log_2 p\\rceil$ that determine\nuniquely in $O(1)$ time for each communication round the new block that the\nprocessor will receive, and the already received block it has to send. Schedule\ncomputations are done independently per processor without communication. The\nbroadcast communication subgraph is an easily computable, directed,\n$\\lceil\\log_2 p\\rceil$-regular circulant graph also used elsewhere. We show how\nthe schedule computations can be done in optimal time and space of $O(\\log p)$,\nimproving significantly over previous results of $O(p\\log^2 p)$ and $O(\\log^3\np)$, respectively. The schedule computation and broadcast algorithms are simple\nto implement, but correctness and complexity are not obvious. The schedules are\nused for new implementations of the MPI (Message-Passing Interface) collectives\nMPI_Bcast, MPI_Allgatherv, MPI_Reduce and MPI_Reduce_scatter. Preliminary\nexperimental results are given.\n","authors":["Jesper Larsson Träff"],"pdf_url":"https://arxiv.org/pdf/2407.18004v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.11236"},{"id":"http://arxiv.org/abs/2407.17676v2","updated":"2024-07-26T02:15:47Z","published":"2024-07-25T00:00:46Z","title":"Empowering the Quantum Cloud User with QRIO","summary":" Quantum computing is moving swiftly from theoretical to practical\napplications, making it crucial to establish a significant quantum advantage.\nDespite substantial investments, access to quantum devices is still limited,\nwith users facing issues like long wait times and inefficient resource\nmanagement. 
Unlike the mature cloud solutions for classical computing, quantum\ncomputing lacks effective infrastructure for resource optimization. We propose\na Quantum Resource Infrastructure Orchestrator (QRIO), a state-of-the-art cloud\nresource manager built on Kubernetes that is tailored to quantum computing.\nQRIO seeks to democratize access to quantum devices by providing customizable,\nuser-friendly, open-source resource management. QRIO's design aims to ensure\nequitable access, optimize resource utilization, and support diverse\napplications, thereby speeding up innovation and making quantum computing more\naccessible and efficient to a broader user base. In this paper, we discuss\nQRIO's various features and evaluate its capability in several representative\nuse cases.\n","authors":["Shmeelok Chakraborty","Yuewen Hou","Ang Chen","Gokul Subramanian Ravi"],"pdf_url":"https://arxiv.org/pdf/2407.17676v2.pdf","comment":"To appear at the IEEE International Symposium on Workload\n Characterization, 2024"},{"id":"http://arxiv.org/abs/2407.13618v2","updated":"2024-07-26T08:01:30Z","published":"2024-07-18T15:55:52Z","title":"DDS: DPU-optimized Disaggregated Storage [Extended Report]","summary":" This extended report presents DDS, a novel disaggregated storage architecture\nenabled by emerging networking hardware, namely DPUs (Data Processing Units).\nDPUs can optimize the latency and CPU consumption of disaggregated storage\nservers. However, utilizing DPUs for DBMSs requires careful design of the\nnetwork and storage paths and the interface exposed to the DBMS. To fully\nbenefit from DPUs, DDS heavily uses DMA, zero-copy, and userspace I/O to\nminimize overhead when improving throughput. It also introduces an offload\nengine that eliminates host CPUs by executing client requests directly on the\nDPU. Adopting DDS' API requires minimal DBMS modification. Our experimental\nstudy and production system integration show promising results -- DDS achieves\nhigher disaggregated storage throughput with an order of magnitude lower\nlatency, and saves up to tens of CPU cores per storage server.\n","authors":["Qizhen Zhang","Philip Bernstein","Badrish Chandramouli","Jiasheng Hu","Yiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2407.13618v2.pdf","comment":"This work has been accepted by VLDB 2024"},{"id":"http://arxiv.org/abs/2407.14575v2","updated":"2024-07-26T17:35:20Z","published":"2024-07-19T16:19:14Z","title":"Regression prediction algorithm for energy consumption regression in\n cloud computing based on horned lizard algorithm optimised convolutional\n neural network-bidirectional gated recurrent unit","summary":" For this paper, a prediction study of cloud computing energy consumption was\nconducted by optimising the data regression algorithm based on the horned\nlizard optimisation algorithm for Convolutional Neural Networks-Bi-Directional\nGated Recurrent Units. Firstly, through Spearman correlation analysis of CPU\nusage, memory usage, network traffic, power consumption, number of instructions\nexecuted, execution time and energy efficiency, we found that power consumption\nhas the highest degree of positive correlation with energy efficiency, while\nCPU usage has the highest degree of negative correlation with energy\nefficiency. In our experiments, we introduced a random forest model and an\noptimisation model based on the horned lizard optimisation algorithm for\ntesting, and the results show that the optimisation algorithm has better\nprediction results compared to the random forest model. 
Specifically, the mean\nsquare error (MSE) of the optimisation algorithm is 0.01 smaller than that of\nthe random forest model, and the mean absolute error (MAE) is 0.01 smaller than\nthat of the random forest. The results of the combined metrics show that the\noptimisation algorithm performs more accurately and reliably in predicting\nenergy efficiency. This research result provides new ideas and methods to\nimprove the energy efficiency of cloud computing systems. This research not\nonly expands the scope of application in the field of cloud computing, but also\nprovides strong support for improving the energy use efficiency of the\nsystem.\n","authors":["Feiyang Li","Zinan Cao","Qixuan Yu","Xirui Tang"],"pdf_url":"https://arxiv.org/pdf/2407.14575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18795v1","updated":"2024-07-26T14:58:22Z","published":"2024-07-26T14:58:22Z","title":"Lectures on Parallel Computing","summary":" These lecture notes are designed to accompany an imaginary, virtual,\nundergraduate, one or two semester course on fundamentals of Parallel Computing\nas well as to serve as background and reference for graduate courses on\nHigh-Performance Computing, parallel algorithms and shared-memory\nmultiprocessor programming. They introduce theoretical concepts and tools for\nexpressing, analyzing and judging parallel algorithms and, in detail, cover the\ntwo most widely used concrete frameworks OpenMP and MPI as well as the\nthreading interface pthreads for writing parallel programs for either shared or\ndistributed memory parallel computers with emphasis on general concepts and\nprinciples. Code examples are given in a C-like style and many are actual,\ncorrect C code. The lecture notes deliberately do not cover GPU architectures\nand GPU programming, but the general concerns, guidelines and principles (time,\nwork, cost, efficiency, scalability, memory structure and bandwidth) will be\njust as relevant for efficiently utilizing various GPU architectures. Likewise,\nthe lecture notes focus on deterministic algorithms only and do not use\nrandomization. The student of this material will find it instructive to take\nthe time to understand concepts and algorithms visually. The exercises can be\nused for self-study and as inspiration for small implementation projects in\nOpenMP and MPI that can and should accompany any serious course on Parallel\nComputing. The student will benefit from actually implementing and carefully\nbenchmarking the suggested algorithms on the parallel computing system that may\nor should be made available as part of such a Parallel Computing course. In\nclass, the exercises can be used as a basis for hand-ins and small programming\nprojects for which sufficient, additional detail and precision should be\nprovided by the instructor.\n","authors":["Jesper Larsson Träff"],"pdf_url":"https://arxiv.org/pdf/2407.18795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14250v4","updated":"2024-07-26T13:21:54Z","published":"2024-04-22T15:02:41Z","title":"Frosty: Bringing strong liveness guarantees to the Snow family of\n consensus protocols","summary":" Snowman is the consensus protocol implemented by the Avalanche blockchain and\nis part of the Snow family of protocols, first introduced through the original\nAvalanche leaderless consensus protocol. 
A major advantage of Snowman is that\neach consensus decision only requires an expected constant communication\noverhead per processor in the `common' case that the protocol is not under\nsubstantial Byzantine attack, i.e. it provides a solution to the scalability\nproblem which ensures that the expected communication overhead per processor is\nindependent of the total number of processors $n$ during normal operation. This\nis the key property that would enable a consensus protocol to scale to 10,000\nor more independent validators (i.e. processors). On the other hand, the two\nfollowing concerns have remained:\n (1) Providing formal proofs of consistency for Snowman has presented a\nformidable challenge.\n (2) Liveness attacks exist in the case that a Byzantine adversary controls\nmore than $O(\\sqrt{n})$ processors, slowing termination to more than a\nlogarithmic number of steps.\n In this paper, we address the two issues above. We consider a Byzantine\nadversary that controls at most $f
+
+ MyArxiv
+
+ + Hardware Architecture 3 + +
+
+
+ + ☆ Non-Overlapping Placement of Macro Cells based on Reinforcement Learning + in Chip Design + + +
+ Due to the increasing complexity of chip design, existing placement methods +still have many shortcomings in dealing with macro cell coverage and +optimization efficiency. To address the problems of layout overlap, inferior +performance, and low optimization efficiency in existing chip design methods, +this paper proposes an end-to-end placement method, SRLPlacer, based on +reinforcement learning. First, the placement problem is transformed into a +Markov decision process by establishing the coupling relationship graph model +between macro cells to learn the strategy for optimizing layouts. Secondly, the +whole placement process is optimized after integrating the standard cell +layout. Evaluated on the public benchmark ISPD2005, the proposed SRLPlacer +can effectively solve the overlap problem between macro cells while considering +routing congestion and shortening the total wire length to ensure routability. +
+
+
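The abstract above frames macro placement as a Markov decision process. As a concrete reading of that framing, here is a minimal sketch of a placement MDP in Python; the environment, the reward shaping (negative half-perimeter wirelength plus an overlap penalty) and the random rollout policy are illustrative assumptions of mine, not SRLPlacer's published design.

```python
# Illustrative sketch: macro placement as a sequential decision process.
# The reward shaping (negative HPWL plus an overlap penalty) is a generic
# proxy, NOT SRLPlacer's actual formulation.
import random

class ToyPlacementEnv:
    """Place unit-size macros one by one on a width x height grid."""
    def __init__(self, num_macros, nets, width=16, height=16):
        self.num_macros, self.nets = num_macros, nets  # nets: lists of macro ids
        self.width, self.height = width, height
        self.reset()

    def reset(self):
        self.pos = {}          # macro id -> (x, y)
        self.next_macro = 0
        return dict(self.pos)

    def step(self, xy):
        overlap = xy in self.pos.values()
        self.pos[self.next_macro] = xy
        self.next_macro += 1
        done = self.next_macro == self.num_macros
        reward = -self._hpwl() - (10.0 if overlap else 0.0)
        return dict(self.pos), reward, done

    def _hpwl(self):
        # Half-perimeter wirelength over nets whose macros are placed so far.
        total = 0.0
        for net in self.nets:
            pts = [self.pos[m] for m in net if m in self.pos]
            if len(pts) > 1:
                xs, ys = zip(*pts)
                total += (max(xs) - min(xs)) + (max(ys) - min(ys))
        return total

# Random rollout, standing in for a learned RL policy.
env = ToyPlacementEnv(num_macros=4, nets=[[0, 1], [1, 2, 3]])
state, done = env.reset(), False
while not done:
    action = (random.randrange(env.width), random.randrange(env.height))
    state, reward, done = env.step(action)
print("final placement:", state)
```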
+
+
+ + ♻ ☆ Exploring the Limitations of Kolmogorov-Arnold Networks in + Classification: Insights to Software Training and Hardware Implementation + + +
+ Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have +recently gained popularity and attention due to the ability to substitute +multi-layer perceptrons (MLPs) in artificial intelligence (AI) with higher +accuracy and interoperability. However, KAN assessment is still limited and +cannot provide an in-depth analysis of a specific domain. Furthermore, no study +has been conducted on the implementation of KANs in hardware design, which +would directly demonstrate whether KANs are truly superior to MLPs in practical +applications. As a result, in this paper, we focus on verifying KANs for +classification issues, which are a common but significant topic in AI, using +four different types of datasets. Furthermore, the corresponding hardware +implementation is considered using the Vitis high-level synthesis (HLS) tool. +To the best of our knowledge, this is the first article to implement hardware +for KAN. The results indicate that KANs cannot achieve higher accuracy than MLPs +in highly complex datasets while utilizing substantially higher hardware +resources. Therefore, MLP remains an effective approach for achieving accuracy +and efficiency in software and hardware implementation. +
+
+ comment: 6 pages, 3 figures, 2 tables +
+
+
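To make the KAN-versus-MLP contrast above concrete: in an MLP an edge carries a scalar weight and the nonlinearity is fixed, while in a KAN each edge carries its own learnable univariate function. A toy forward pass follows; the tiny fixed basis stands in for the B-spline bases (with adaptive grids) that real KAN implementations use, so this is only a structural illustration.

```python
# Toy contrast between an MLP edge (a scalar weight) and a KAN-style edge
# (a learnable univariate function). Forward pass only; the fixed basis
# below is an illustrative stand-in for learnable B-splines.
import numpy as np

rng = np.random.default_rng(0)

def mlp_layer(x, W, b):
    # Standard MLP layer: fixed nonlinearity after a linear map.
    return np.tanh(W @ x + b)

def kan_layer(x, coeffs):
    # coeffs[j, i, k]: basis coefficients of the function on edge i -> j.
    # Each edge applies its own learned function phi_ji(x_i); outputs sum.
    basis = np.stack([x, x**2, np.sin(x), np.tanh(x)])   # shape (K, n_in)
    return np.einsum('jik,ki->j', coeffs, basis)

n_in, n_out, K = 3, 2, 4
x = rng.normal(size=n_in)
print("MLP :", mlp_layer(x, rng.normal(size=(n_out, n_in)), np.zeros(n_out)))
print("KAN :", kan_layer(x, rng.normal(size=(n_out, n_in, K))))
```

The hardware-cost observation in the abstract is visible even here: each KAN edge needs K coefficients and K basis evaluations where an MLP edge needs a single multiply-accumulate.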
+
+
+ + ♻ ☆ Customized Retrieval Augmented Generation and Benchmarking for EDA Tool + Documentation QA + + +
+ Retrieval augmented generation (RAG) enhances the accuracy and reliability of +generative AI models by sourcing factual information from external databases, +and is extensively employed in document-grounded question-answering (QA) +tasks. Off-the-shelf RAG flows are well pretrained on general-purpose +documents, yet they encounter significant challenges when being applied to +knowledge-intensive vertical domains, such as electronic design automation +(EDA). This paper addresses this issue by proposing a customized RAG framework +along with three domain-specific techniques for EDA tool documentation QA, +including a contrastive learning scheme for text embedding model fine-tuning, a +reranker distilled from a proprietary LLM, and a generative LLM fine-tuned with +a high-quality domain corpus. Furthermore, we have developed and released a +documentation QA evaluation benchmark, ORD-QA, for OpenROAD, an advanced +RTL-to-GDSII design platform. Experimental results demonstrate that our +proposed RAG flow and techniques have achieved superior performance on ORD-QA +as well as on a commercial tool, compared with the state of the art. The ORD-QA +benchmark and the training dataset for our customized RAG flow are open-source +at https://github.com/lesliepy99/RAG-EDA. +
+
+ comment: Accepted by ICCAD 2024 +
+
+
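The customized RAG flow above follows the usual retrieve-rerank-generate shape. A minimal sketch of that pipeline skeleton is given below; the bag-of-words "embeddings", the phrase-match reranker and the toy documentation snippets are all stand-ins of mine for the paper's fine-tuned embedder, distilled reranker and fine-tuned generative LLM.

```python
# Generic retrieve -> rerank -> (generate) skeleton of a RAG flow.
# All components here are trivial stand-ins; the doc snippets are invented.
import math
from collections import Counter

def embed(text):
    # Toy bag-of-words "embedding" (stand-in for a learned embedding model).
    return Counter(text.lower().split())

def cosine(a, b):
    dot = sum(a[t] * b[t] for t in a)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0

def retrieve(query, docs, k=2):
    q = embed(query)
    return sorted(docs, key=lambda d: cosine(q, embed(d)), reverse=True)[:k]

def rerank(query, candidates):
    # Stand-in reranker: exact-phrase bonus on top of the cosine score.
    q = embed(query)
    def score(d):
        return cosine(q, embed(d)) + (1.0 if query.lower() in d.lower() else 0.0)
    return sorted(candidates, key=score, reverse=True)

docs = [
    "global routing is run after placement to plan net topologies",
    "detailed placement legalizes and refines cell locations",
    "timing reports list the worst slack paths in the design",
]
query = "how do I run global routing"
context = rerank(query, retrieve(query, docs, k=2))[0]
print("context passed to the generator:", context)
```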
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 8 + +
+
+
+ + ☆ Lectures on Parallel Computing + + +
+ These lecture notes are designed to accompany an imaginary, virtual, +undergraduate, one or two semester course on fundamentals of Parallel Computing +as well as to serve as background and reference for graduate courses on +High-Performance Computing, parallel algorithms and shared-memory +multiprocessor programming. They introduce theoretical concepts and tools for +expressing, analyzing and judging parallel algorithms and, in detail, cover the +two most widely used concrete frameworks OpenMP and MPI as well as the +threading interface pthreads for writing parallel programs for either shared or +distributed memory parallel computers with emphasis on general concepts and +principles. Code examples are given in a C-like style and many are actual, +correct C code. The lecture notes deliberately do not cover GPU architectures +and GPU programming, but the general concerns, guidelines and principles (time, +work, cost, efficiency, scalability, memory structure and bandwidth) will be +just as relevant for efficiently utilizing various GPU architectures. Likewise, +the lecture notes focus on deterministic algorithms only and do not use +randomization. The student of this material will find it instructive to take +the time to understand concepts and algorithms visually. The exercises can be +used for self-study and as inspiration for small implementation projects in +OpenMP and MPI that can and should accompany any serious course on Parallel +Computing. The student will benefit from actually implementing and carefully +benchmarking the suggested algorithms on the parallel computing system that may +or should be made available as part of such a Parallel Computing course. In +class, the exercises can be used as a basis for hand-ins and small programming +projects for which sufficient, additional detail and precision should be +provided by the instructor. +
+
+
+
+
+ + ♻ ☆ Optimal Broadcast Schedules in Logarithmic Time with Applications to + Broadcast, All-Broadcast, Reduction and All-Reduction + + +
+ We give optimally fast $O(\log p)$ time (per processor) algorithms for +computing round-optimal broadcast schedules for message-passing parallel +computing systems. This affirmatively answers difficult questions posed in a +SPAA 2022 BA and a CLUSTER 2022 paper. We observe that the computed schedules +and circulant communication graph can likewise be used for reduction, +all-broadcast and all-reduction as well, leading to new, round-optimal +algorithms for these problems. These observations affirmatively answer open +questions posed in a CLUSTER 2023 paper. + The problem is to broadcast $n$ indivisible blocks of data from a given root +processor to all other processors in a (subgraph of a) fully connected network +of $p$ processors with fully bidirectional, one-ported communication +capabilities. In this model, $n-1+\lceil\log_2 p\rceil$ communication rounds +are required. Our new algorithms compute for each processor in the network +receive and send schedules each of size $\lceil\log_2 p\rceil$ that determine +uniquely in $O(1)$ time for each communication round the new block that the +processor will receive, and the already received block it has to send. Schedule +computations are done independently per processor without communication. The +broadcast communication subgraph is an easily computable, directed, +$\lceil\log_2 p\rceil$-regular circulant graph also used elsewhere. We show how +the schedule computations can be done in optimal time and space of $O(\log p)$, +improving significantly over previous results of $O(p\log^2 p)$ and $O(\log^3 +p)$, respectively. The schedule computation and broadcast algorithms are simple +to implement, but correctness and complexity are not obvious. The schedules are +used for new implementations of the MPI (Message-Passing Interface) collectives +MPI_Bcast, MPI_Allgatherv, MPI_Reduce and MPI_Reduce_scatter. Preliminary +experimental results are given. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2312.11236 +
+
+
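For orientation, the baseline communication pattern behind this abstract can be illustrated with the classic binomial-tree broadcast of a single block, which completes in ceil(log2 p) rounds. The paper's actual contribution, O(log p)-time per-processor send/receive schedules for n blocks over a circulant graph, is substantially more involved and is not reproduced here; this is only the textbook pattern for comparison.

```python
# Baseline illustration: binomial-tree broadcast of ONE block among p
# processors in ceil(log2 p) rounds. Not the paper's n-block algorithm.
import math

def binomial_broadcast_rounds(p, root=0):
    """Per round, list the (sender, receiver) pairs, relative to root."""
    rounds = []
    for r in range(math.ceil(math.log2(p))):
        step = 1 << r
        # Every processor that already holds the block forwards it.
        pairs = [(s, s + step) for s in range(step) if s + step < p]
        rounds.append([((a + root) % p, (b + root) % p) for a, b in pairs])
    return rounds

for r, pairs in enumerate(binomial_broadcast_rounds(p=6)):
    print(f"round {r}: {pairs}")
```

With p = 6 this prints three rounds, doubling the set of informed processors each time, which matches the ceil(log2 p) term in the round count quoted in the abstract.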
+
+
+ + ♻ ☆ Empowering the Quantum Cloud User with QRIO + + +
+ Quantum computing is moving swiftly from theoretical to practical +applications, making it crucial to establish a significant quantum advantage. +Despite substantial investments, access to quantum devices is still limited, +with users facing issues like long wait times and inefficient resource +management. Unlike the mature cloud solutions for classical computing, quantum +computing lacks effective infrastructure for resource optimization. We propose +a Quantum Resource Infrastructure Orchestrator (QRIO), a state-of-the-art cloud +resource manager built on Kubernetes that is tailored to quantum computing. +QRIO seeks to democratize access to quantum devices by providing customizable, +user-friendly, open-source resource management. QRIO's design aims to ensure +equitable access, optimize resource utilization, and support diverse +applications, thereby speeding up innovation and making quantum computing more +accessible and efficient to a broader user base. In this paper, we discuss +QRIO's various features and evaluate its capability in several representative +use cases. +
+
+ comment: To appear at the IEEE International Symposium on Workload + Characterization, 2024 +
+
+
+
+
+ + ♻ ☆ DDS: DPU-optimized Disaggregated Storage [Extended Report] + + +
+ This extended report presents DDS, a novel disaggregated storage architecture +enabled by emerging networking hardware, namely DPUs (Data Processing Units). +DPUs can optimize the latency and CPU consumption of disaggregated storage +servers. However, utilizing DPUs for DBMSs requires careful design of the +network and storage paths and the interface exposed to the DBMS. To fully +benefit from DPUs, DDS heavily uses DMA, zero-copy, and userspace I/O to +minimize overhead when improving throughput. It also introduces an offload +engine that eliminates host CPUs by executing client requests directly on the +DPU. Adopting DDS' API requires minimal DBMS modification. Our experimental +study and production system integration show promising results -- DDS achieves +higher disaggregated storage throughput with an order of magnitude lower +latency, and saves up to tens of CPU cores per storage server. + +
+
+ comment: This work has been accepted by VLDB 2024 +
+
+
+
+
+ + ♻ ☆ Regression prediction algorithm for energy consumption regression in + cloud computing based on horned lizard algorithm optimised convolutional + neural network-bidirectional gated recurrent unit + + +
+ For this paper, a prediction study of cloud computing energy consumption was +conducted by optimising the data regression algorithm based on the horned +lizard optimisation algorithm for Convolutional Neural Networks-Bi-Directional +Gated Recurrent Units. Firstly, through Spearman correlation analysis of CPU +usage, memory usage, network traffic, power consumption, number of instructions +executed, execution time and energy efficiency, we found that power consumption +has the highest degree of positive correlation with energy efficiency, while +CPU usage has the highest degree of negative correlation with energy +efficiency. In our experiments, we introduced a random forest model and an +optimisation model based on the horned lizard optimisation algorithm for +testing, and the results show that the optimisation algorithm has better +prediction results compared to the random forest model. Specifically, the mean +square error (MSE) of the optimisation algorithm is 0.01 smaller than that of +the random forest model, and the mean absolute error (MAE) is 0.01 smaller than +that of the random forest. The results of the combined metrics show that the +optimisation algorithm performs more accurately and reliably in predicting +energy efficiency. This research result provides new ideas and methods to +improve the energy efficiency of cloud computing systems. This research not +only expands the scope of application in the field of cloud computing, but also +provides strong support for improving the energy use efficiency of the +system. +
+
+
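The evaluation recipe in this abstract (Spearman rank correlations between resource features and energy efficiency, followed by MSE/MAE model comparison) is easy to state in code. A sketch on synthetic data follows; the data-generating process is invented, and a trivial mean predictor stands in for the second model since the horned-lizard-optimised CNN-BiGRU is not reproduced here.

```python
# Spearman rank correlation plus MSE/MAE comparison, on synthetic data.
import numpy as np

rng = np.random.default_rng(1)

def spearman(x, y):
    # Rank correlation via double argsort (no tie handling needed here,
    # since the synthetic features are continuous).
    rx = np.argsort(np.argsort(x)).astype(float)
    ry = np.argsort(np.argsort(y)).astype(float)
    rx -= rx.mean()
    ry -= ry.mean()
    return float((rx @ ry) / np.sqrt((rx @ rx) * (ry @ ry)))

n = 200
power = rng.uniform(50, 300, n)                  # watts (synthetic)
cpu = rng.uniform(0.1, 1.0, n)                   # utilisation (synthetic)
efficiency = 0.8 * power - 40.0 * cpu + rng.normal(0, 5, n)

print("spearman(power, eff) =", round(spearman(power, efficiency), 3))
print("spearman(cpu,   eff) =", round(spearman(cpu, efficiency), 3))

# MSE / MAE comparison between two predictors of `efficiency`.
pred_linear = 0.8 * power - 40.0 * cpu           # stand-in "good" model
pred_mean = np.full(n, efficiency.mean())        # trivial baseline
for name, pred in [("linear", pred_linear), ("mean", pred_mean)]:
    err = efficiency - pred
    print(f"{name}: MSE={np.mean(err**2):.2f}  MAE={np.mean(np.abs(err)):.2f}")
```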
+
+
+ + ♻ ☆ Frosty: Bringing strong liveness guarantees to the Snow family of + consensus protocols + + +
+ Snowman is the consensus protocol implemented by the Avalanche blockchain and +is part of the Snow family of protocols, first introduced through the original +Avalanche leaderless consensus protocol. A major advantage of Snowman is that +each consensus decision only requires an expected constant communication +overhead per processor in the `common' case that the protocol is not under +substantial Byzantine attack, i.e. it provides a solution to the scalability +problem which ensures that the expected communication overhead per processor is +independent of the total number of processors $n$ during normal operation. This +is the key property that would enable a consensus protocol to scale to 10,000 +or more independent validators (i.e. processors). On the other hand, the two +following concerns have remained: + (1) Providing formal proofs of consistency for Snowman has presented a +formidable challenge. + (2) Liveness attacks exist in the case that a Byzantine adversary controls +more than $O(\sqrt{n})$ processors, slowing termination to more than a +logarithmic number of steps. + In this paper, we address the two issues above. We consider a Byzantine +adversary that controls at most $f +
+
+
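For readers unfamiliar with the Snow family: the constant expected per-decision communication claimed above comes from repeatedly polling small random samples of validators rather than the whole network. A schematic Snowball-style loop is sketched below; the parameters and the simplified flip rule are illustrative only, and this is not the Snowman or Frosty protocol itself.

```python
# Schematic Snow-family sampling loop: poll k random peers per round and
# build confidence in the majority value. Parameters (k, alpha, beta) and
# the flip rule are illustrative, not Snowman/Frosty.
import random

def snowball_decide(peers, my_pref, k=5, alpha=4, beta=10, seed=7):
    rng = random.Random(seed)
    pref, confidence = my_pref, 0
    while confidence < beta:
        sample = rng.sample(peers, k)              # poll k random validators
        votes = sum(1 for p in sample if p == pref)
        if votes >= alpha:
            confidence += 1                        # consecutive successful polls
        else:
            majority = max(set(sample), key=sample.count)
            pref, confidence = majority, 0         # flip and restart counting
    return pref

# 100 honest validators, 80% initially preferring value 1.
peers = [1] * 80 + [0] * 20
print("decided value:", snowball_decide(peers, my_pref=0))
```

Note how the per-decision cost is a number of k-sized polls that does not grow with the validator count, which is the scalability property the abstract emphasises.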
+
+
+ + ♻ ☆ Detrimental task execution patterns in mainstream OpenMP runtimes + + +
+ The OpenMP API offers both task-based and data-parallel concepts to +scientific computing. While it provides descriptive and prescriptive +annotations, it is in many places deliberately unspecific about how to implement its +annotations. As the predominant OpenMP implementations share design rationales, +they introduce "quasi-standards" for how certain annotations behave. By means of a +task-based astrophysical simulation code, we highlight situations where this +"quasi-standard" reference behaviour introduces performance flaws. Therefore, +we propose prescriptive clauses to constrain the OpenMP implementations. +Simulated task traces uncover the clauses' potential, while a discussion of +their realization highlights that they would manifest in rather incremental +changes to any OpenMP runtime supporting task priorities. +
+
+
+
+
+ + ♻ ☆ Software Resource Disaggregation for HPC with Serverless Computing + + +
+ Aggregated HPC resources have rigid allocation systems and programming models +which struggle to adapt to diverse and changing workloads. Consequently, HPC +systems fail to efficiently use the large pools of unused memory and increase +the utilization of idle computing resources. Prior work attempted to increase +the throughput and efficiency of supercomputing systems through workload +co-location and resource disaggregation. However, these methods fall short of +providing a solution that can be applied to existing systems without major +hardware modifications and performance losses. In this paper, we improve the +utilization of supercomputers by employing the new cloud paradigm of serverless +computing. We show how serverless functions provide fine-grained access to the +resources of batch-managed cluster nodes. We present an HPC-oriented +Function-as-a-Service (FaaS) that satisfies the requirements of +high-performance applications. We demonstrate a software resource +disaggregation approach where placing functions on unallocated and +underutilized nodes allows idle cores and accelerators to be utilized while +retaining near-native performance. + +
+
+ comment: Accepted for publication in the 2024 International Parallel and + Distributed Processing Symposium (IPDPS) +
+
+
+
+
+
+
+
+ + Programming and Languages 3 + +
+
+
+ + ♻ ☆ Bluefish: Composing Diagrams with Declarative Relations + + +
+ Diagrams are essential tools for problem-solving and communication as they +externalize conceptual structures using spatial relationships. But when picking +a diagramming framework, users are faced with a dilemma. They can either use a +highly expressive but low-level toolkit, whose API does not match their +domain-specific concepts, or select a high-level typology, which offers a +recognizable vocabulary but supports a limited range of diagrams. To address +this gap, we introduce Bluefish: a diagramming framework inspired by +component-based user interface (UI) libraries. Bluefish lets users create +diagrams using relations: declarative, composable, and extensible diagram +fragments that relax the concept of a UI component. Unlike a component, a +relation does not have sole ownership over its children nor does it need to +fully specify their layout. To render diagrams, Bluefish extends a traditional +tree-based scenegraph to a compound graph that captures both hierarchical and +adjacent relationships between nodes. To evaluate our system, we construct a +diverse example gallery covering many domains including mathematics, physics, +computer science, and even cooking. We show that Bluefish's relations are +effective declarative primitives for diagrams. Bluefish is open source, and we +aim to shape it into both a usable tool and a research platform. + +
+
+ comment: 27 pages, 14 figures +
+
+
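The compound-graph idea above (hierarchy edges from ownership, plus adjacency edges from relations that reference nodes without owning them) can be pictured with a small data structure. The class and method names below are my own illustration of the concept, not Bluefish's actual API.

```python
# Minimal sketch of a compound scenegraph: hierarchical parent/child edges
# plus adjacency edges contributed by non-owning relations.
from collections import defaultdict

class CompoundGraph:
    def __init__(self):
        self.children = defaultdict(list)   # hierarchy: parent -> children
        self.adjacent = []                  # relations: (kind, node_a, node_b)

    def add_child(self, parent, child):
        self.children[parent].append(child)

    def relate(self, kind, a, b):
        # A relation references existing nodes without owning them and
        # without fully specifying their layout.
        self.adjacent.append((kind, a, b))

    def walk(self, root, depth=0):
        yield depth, root
        for c in self.children[root]:
            yield from self.walk(c, depth + 1)

g = CompoundGraph()
for node in ["circle", "label"]:
    g.add_child("diagram", node)
g.relate("align-middle", "circle", "label")   # relation spans siblings

for depth, node in g.walk("diagram"):
    print("  " * depth + node)
print("relations:", g.adjacent)
```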
+
+
+ + ♻ ☆ A Coq Mechanization of JavaScript Regular Expression Semantics + + +
+ We present an executable, proven-safe, faithful, and future-proof Coq +mechanization of JavaScript regular expression (regex) matching, as specified +by the latest published edition of ECMA-262 section 22.2. This is, to our +knowledge, the first time that an industrial-strength regex language has been +faithfully mechanized in an interactive theorem prover. We highlight +interesting challenges that arose in the process (including issues of encoding, +corner cases, and executability), and we document the steps that we took to +ensure that the result is straightforwardly auditable and that our +understanding of the specification aligns with existing implementations. + We demonstrate the usability and versatility of the mechanization through a +broad collection of analyses, case studies, and experiments: we prove that +JavaScript regex matching always terminates and is safe (no assertion +failures); we identify subtle corner cases that led to mistakes in previous +publications; we verify an optimization extracted from a state-of-the-art regex +engine; we show that some classic properties described in automata textbooks +and used in derivatives-based matchers do not hold in JavaScript regexes; and +we demonstrate that the cost of updating the mechanization to account for +changes in the original specification is reasonably low. + Our mechanization can be extracted to OCaml and JavaScript and linked with +Unicode libraries to produce an executable regex engine that passes the +relevant parts of the official Test262 conformance test suite. + +
+
+
+
+
+ + ♻ ☆ Detrimental task execution patterns in mainstream OpenMP runtimes + + +
+ The OpenMP API offers both task-based and data-parallel concepts to +scientific computing. While it provides descriptive and prescriptive +annotations, it is in many places deliberately unspecific about how to implement its +annotations. As the predominant OpenMP implementations share design rationales, +they introduce "quasi-standards" for how certain annotations behave. By means of a +task-based astrophysical simulation code, we highlight situations where this +"quasi-standard" reference behaviour introduces performance flaws. Therefore, +we propose prescriptive clauses to constrain the OpenMP implementations. +Simulated task traces uncover the clauses' potential, while a discussion of +their realization highlights that they would manifest in rather incremental +changes to any OpenMP runtime supporting task priorities. +
+
+
+
+
+
+
+
+ + Logic in Computer Science 8 + +
+
+
+ + ☆ Repairing Networks of $\mathcal{EL_\perp}$ Ontologies using Weakening + and Completing -- Extended version + + +
+ The quality of ontologies and their alignments is crucial for developing +high-quality semantics-based applications. Traditional debugging techniques +repair ontology networks by removing unwanted axioms and mappings, but may +thereby remove consequences that are correct in the domain of the ontology +network. In this paper we propose a framework for repairing ontology networks +that deals with this issue. It defines basic operations such as debugging, +weakening and completing. Further, it defines combination operators that +reflect choices in how and when to use the basic operators, as well as choices +regarding the autonomy level of the ontologies and alignments in the ontology +network. We show the influence of the combination operators on the quality of +the repaired network and present an implemented tool. By using our framework +together with existing algorithms for debugging, weakening and completing, we +essentially provide a blueprint for extending previous work and systems. + +
+
+ comment: This is a slightly revised and extended version of a paper published + at ISWC 2024. arXiv admin note: text overlap with arXiv:2208.00486 +
+
+
+
+
+ + ☆ LLASP: Fine-tuning Large Language Models for Answer Set Programming + + +
+ Recently, Large Language Models (LLMs) have showcased their potential in +various natural language processing tasks, including code generation. However, +while significant progress has been made in adapting LLMs to generate code for +several imperative programming languages and tasks, there remains a notable gap +in their application to declarative formalisms, such as Answer Set Programming +(ASP). In this paper, we move a step towards exploring the capabilities of LLMs +for ASP code generation. First, we perform a systematic evaluation of several +state-of-the-art LLMs. Despite their power in terms of the number of parameters, +training data and computational resources, empirical results demonstrate +inadequate performance in generating correct ASP programs. Therefore, we +propose LLASP, a fine-tuned lightweight model specifically trained to encode +fundamental ASP program patterns. To this aim, we create an ad-hoc dataset +covering a wide variety of fundamental problem specifications that can be +encoded in ASP. Our experiments demonstrate that the quality of ASP programs +generated by LLASP is remarkable. This holds true not only when compared to the +non-fine-tuned counterpart but also when compared to the majority of eager LLM +candidates, particularly from a semantic perspective. All the code and data +used to perform the experiments are publicly available at +https://anonymous.4open.science/r/LLASP-D86C/. +
+
+
+
+
+ + ☆ A Reliable Common-Sense Reasoning Socialbot Built Using LLMs and + Goal-Directed ASP + + +
+ The development of large language models (LLMs), such as GPT, has enabled the
+construction of several socialbots, like ChatGPT, that are receiving a lot of
+attention for their ability to simulate a human conversation. However, the
+conversation is not guided by a goal and is hard to control. In addition,
+because LLMs rely more on pattern recognition than deductive reasoning, they
+can give confusing answers and have difficulty integrating multiple topics into
+a cohesive response. These limitations often lead the LLM to deviate from the
+main topic to keep the conversation interesting. We propose AutoCompanion, a
+socialbot that uses an LLM to translate natural language into predicates (and
+vice versa) and employs commonsense reasoning based on Answer Set Programming
+(ASP) to hold a social conversation with a human. In particular, we rely on
+s(CASP), a goal-directed implementation of ASP, as the backend. This paper
+presents the framework design and how an LLM is used to parse user messages
+and generate a response from the s(CASP) engine output. To validate our
+proposal, we describe (real) conversations in which the chatbot's goal is to
+keep the user entertained by talking about movies and books, and s(CASP)
+ensures (i) correctness of answers, (ii) coherence (and precision) during the
+conversation, which it dynamically regulates to achieve its specific purpose,
+and (iii) no deviation from the main topic.
+
+
+
+
+
+ + ♻ ☆ Static and Dynamic Verification of OCaml Programs: The Gospel Ecosystem + (Extended Version) + + +
+ We present our work on the collaborative use of dynamic and static analysis
+tools for the verification of software written in the OCaml language. We build
+upon Gospel, a specification language for OCaml that can be used both in
+dynamic and static analyses. We employ Ortac, for runtime assertion checking,
+and Cameleer and CFML for the deductive verification of OCaml code. We report
+on the use of such tools to build a case study of collaborative analysis of a
+non-trivial OCaml program. This shows how these tools nicely complement each
+other, while at the same time highlighting the differences when writing
+specifications targeting dynamic or static analysis methods.
+
+
+
+
+
+ + ♻ ☆ Preservation theorems for Tarski's relation algebra + + +
+ We investigate a number of semantically defined fragments of Tarski's algebra
+of binary relations, including the function-preserving fragment. We address the
+question of whether they are generated by a finite set of operations. We obtain
+several positive and negative results along these lines. Specifically, the
+homomorphism-safe fragment is finitely generated (both over finite and over
+arbitrary structures). The function-preserving fragment is not finitely
+generated (and, in fact, not expressible by any finite set of guarded
+second-order definable function-preserving operations). Similarly, the
+total-function-preserving fragment is not finitely generated (and, in fact, not
+expressible by any finite set of guarded second-order definable
+total-function-preserving operations). In contrast, the forward-looking
+function-preserving fragment is finitely generated by composition,
+intersection, antidomain, and preferential union. Similarly, the
+forward-and-backward-looking injective-function-preserving fragment is finitely
+generated by composition, intersection, antidomain, inverse, and an `injective
+union' operation.
+
+
+
+
+
+ + ♻ ☆ The Algebras for Automatic Relations + + +
+ We introduce "synchronous algebras", an algebraic structure tailored to +recognize automatic relations (aka. synchronous relations, or regular +relations). They are the equivalent of monoids for regular languages, however +they conceptually differ in two points: first, they are typed and second, they +are equipped with a dependency relation expressing constraints between elements +of different types. + The interest of the proposed definition is that it allows to lift, in an +effective way, pseudovarieties of regular languages to that of synchronous +relations, and we show how algebraic characterizations of pseudovarieties of +regular languages can be lifted to the pseudovarieties of synchronous relations +that they induce. A typical example of such a pseudovariety is the class of +"group relations", defined as the relations recognized by finite-state +synchronous permutation automata. + In order to prove this result, we adapt two pillars of algebraic language to +synchronous algebras: (a) any relation admits a syntactic synchronous algebra +recognizing it, and moreover, the relation is synchronous if, and only if, its +syntactic algebra is finite and (b) classes of synchronous relations with +desirable closure properties (i.e. pseudovarieties) correspond to +pseudovarieties of synchronous algebras. + +
+
+
+
+
+ + ♻ ☆ Quantum Büchi Automata + + +
+ Quantum finite automata (QFAs) have been extensively studied in the
+literature. In this paper, we define and systematically study quantum B\"uchi
+automata (QBAs) over infinite words to model the long-term behavior of quantum
+systems, which extend QFAs. We introduce the classes of $\omega$-languages
+recognized by QBAs in probable, almost sure, strict and non-strict threshold
+semantics. Several pumping lemmas and closure properties for QBAs are proved.
+Some decision problems for QBAs are investigated. In particular, we show that
+there are surprisingly only at most four substantially different classes of
+$\omega$-languages recognized by QBAs (out of uncountably many). The
+relationship between classical $\omega$-languages and QBAs is clarified using
+our pumping lemmas. We also find an $\omega$-language recognized by QBAs under
+the almost sure semantics, which is not $\omega$-context-free.
+
+
+ comment: Full final version. 40 pages, 1 figure, 2 tables. Removed [v1]'s
+ disturbing acceptance for simplicity, so all QBAs in [v2] obey
+ non-disturbing acceptance
+
+
+
+
+
+ + ♻ ☆ A Faithful and Quantitative Notion of Distant Reduction for the + Lambda-Calculus with Generalized Applications + + +
+ We introduce a call-by-name lambda-calculus $\lambda Jn$ with generalized
+applications which is equipped with distant reduction. This allows us to
+unblock $\beta$-redexes without resorting to the standard permutative
+conversions of generalized applications used in the original
+$\Lambda J$-calculus with generalized applications of Joachimski and Matthes.
+We show strong normalization of simply-typed terms, and we then fully
+characterize strong normalization by means of a quantitative (i.e.
+non-idempotent intersection) typing system. This characterization uses a
+non-trivial inductive definition of strong normalization (related to others in
+the literature), which is based on a weak-head normalizing strategy. We also
+show that our calculus $\lambda Jn$ relates to explicit substitution calculi by
+means of a faithful translation, in the sense that it preserves strong
+normalization. Moreover, our calculus $\lambda Jn$ and the original
+$\Lambda J$-calculus determine equivalent notions of strong normalization. As a
+consequence, $\Lambda J$ inherits a faithful translation into explicit
+substitutions, and its strong normalization can also be characterized by the
+quantitative typing system designed for $\lambda Jn$, despite the fact that
+quantitative subject reduction fails for permutative conversions.
+
+
+
+
+
+
+
+
+ + Computational Complexity 1 + +
+
+
+ + ♻ ☆ On Degeneracy in the P-Matroid Oriented Matroid Complementarity Problem + + +
+ Klaus showed that the Oriented Matroid Complementarity Problem (OMCP) can be +solved by a reduction to the problem of sink-finding in a unique sink +orientation (USO) if the input is promised to be given by a non-degenerate +extension of a P-matroid. In this paper, we investigate the effect of +degeneracy on this reduction. On the one hand, this understanding of +degeneracies allows us to prove a linear lower bound on the number of vertex +evaluations required for sink-finding in P-matroid USOs, the set of USOs +obtainable through Klaus' reduction. On the other hand, it allows us to adjust +Klaus' reduction to also work with degenerate instances. Furthermore, we +introduce a total search version of the P-Matroid Oriented Matroid +Complementarity Problem (P-OMCP). Given any extension of any oriented matroid +M, by reduction to a total search version of USO sink-finding we can either +solve the OMCP, or provide a polynomial-time verifiable certificate that M is +not a P-matroid. This places the total search version of the P-OMCP in the +complexity class Unique End of Potential Line (UEOPL). + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 4 + +
+
+
+ + ☆ Tight Bounds for the Number of Absent Scattered Factors + + +
+ A scattered factor of a word $w$ is a word $u$ that can be obtained by
+deleting arbitrary letters from $w$ while keeping the order of the remaining
+ones. Barker et al. introduced the notion of $k$-universality, calling a word
+$k$-universal if it contains all possible words of length $k$ over a given
+alphabet $\Sigma$ as scattered factors. Kosche et al. introduced the notion of
+absent scattered factors to categorise the words that are not scattered
+factors of a given word.
+ In this paper, we investigate tight bounds on the possible number of absent
+scattered factors of a given length $k$ (also strictly longer than the shortest
+absent scattered factors) among all words with the same universality, extending
+the results of Kosche et al. Specifically, given a length $k$ and universality
+index $\iota$, we characterize $\iota$-universal words with both the maximal
+and minimal number of absent scattered factors of length $k$. For the lower
+bound, we provide the exact number in a closed form. For the upper bound, we
+offer efficient algorithms to compute the number based on the constructed
+words. Moreover, by combining old results, we present an enumeration with
+constant delay of the set of scattered factors of a fixed length in time
+$O(|\Sigma||w|)$.
+
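To make the notions concrete, here is a minimal Python sketch of the universality index computed by the standard greedy arch factorization: a word is $k$-universal exactly when it decomposes into at least $k$ complete arches. The paper's counting of absent scattered factors is its own contribution and is not reproduced here.

    def universality_index(w, alphabet):
        # Greedy arch factorization: repeatedly scan for a minimal prefix
        # containing every alphabet letter; each such prefix is one "arch".
        need, arches = set(alphabet), 0
        for c in w:
            need.discard(c)
            if not need:                # one arch completed
                arches += 1
                need = set(alphabet)
        return arches

    # A word is k-universal iff its universality index is at least k.
    assert universality_index("abcabc", "abc") == 2   # 2-universal
    assert universality_index("aabb", "ab") == 1      # "ba" is absent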
+
+
+
+
+ + ♻ ☆ The Algebras for Automatic Relations + + +
+ We introduce "synchronous algebras", an algebraic structure tailored to +recognize automatic relations (aka. synchronous relations, or regular +relations). They are the equivalent of monoids for regular languages, however +they conceptually differ in two points: first, they are typed and second, they +are equipped with a dependency relation expressing constraints between elements +of different types. + The interest of the proposed definition is that it allows to lift, in an +effective way, pseudovarieties of regular languages to that of synchronous +relations, and we show how algebraic characterizations of pseudovarieties of +regular languages can be lifted to the pseudovarieties of synchronous relations +that they induce. A typical example of such a pseudovariety is the class of +"group relations", defined as the relations recognized by finite-state +synchronous permutation automata. + In order to prove this result, we adapt two pillars of algebraic language to +synchronous algebras: (a) any relation admits a syntactic synchronous algebra +recognizing it, and moreover, the relation is synchronous if, and only if, its +syntactic algebra is finite and (b) classes of synchronous relations with +desirable closure properties (i.e. pseudovarieties) correspond to +pseudovarieties of synchronous algebras. + +
+
+
+
+
+ + ♻ ☆ Quantum Büchi Automata + + +
+ Quantum finite automata (QFAs) have been extensively studied in the
+literature. In this paper, we define and systematically study quantum B\"uchi
+automata (QBAs) over infinite words to model the long-term behavior of quantum
+systems, which extend QFAs. We introduce the classes of $\omega$-languages
+recognized by QBAs in probable, almost sure, strict and non-strict threshold
+semantics. Several pumping lemmas and closure properties for QBAs are proved.
+Some decision problems for QBAs are investigated. In particular, we show that
+there are surprisingly only at most four substantially different classes of
+$\omega$-languages recognized by QBAs (out of uncountably many). The
+relationship between classical $\omega$-languages and QBAs is clarified using
+our pumping lemmas. We also find an $\omega$-language recognized by QBAs under
+the almost sure semantics, which is not $\omega$-context-free.
+
+
+ comment: Full final version. 40 pages, 1 figure, 2 tables. Removed [v1]'s
+ disturbing acceptance for simplicity, so all QBAs in [v2] obey
+ non-disturbing acceptance
+
+
+
+
+
+ + ♻ ☆ Conjugacy for certain automorphisms of the one-sided shift via + transducers + + +
+ We address the following open problem, implicit in the 1990 article
+"Automorphisms of one-sided subshifts of finite type" of Boyle, Franks and
+Kitchens (BFK):
+ "Does there exist an element $\psi$ in the group of automorphisms of the
+one-sided shift $\operatorname{Aut}(\{0,1,\ldots,n-1\}^{\mathbb{N}},
+\sigma_{n})$ so that all points of $\{0,1,\ldots,n-1\}^{\mathbb{N}}$ have
+orbits of length $n$ under $\psi$ and $\psi$ is not conjugate to a
+permutation?"
+ Here, by a 'permutation' we mean an automorphism of the one-sided shift
+dynamical system induced by a permutation of the symbol set
+$\{0,1,\ldots,n-1\}$.
+ We resolve this question by showing that any $\psi$ with properties as above
+must be conjugate to a permutation.
+ Our techniques naturally extend those of BFK using the strongly synchronizing
+automata technology developed here and in several articles of the authors and
+collaborators (although this article has been written to be largely
+self-contained).
+
+
+ comment: 40 pages, 9 Figures. This new version simplifies the exposition and + proof approach. Some of the relabelling lemmas have been replaced by more + targeted and "slimmed" down versions. We are grateful to an anonymous referee + for detailed and very helpful comments on an earlier draft of the article +
+
+
+
+
+
+
+
+
+ +
+
+
+
 + Hardware Architecture 6 +
 +
+
+
+ + ☆ SuperFlow: A Fully-Customized RTL-to-GDS Design Automation Flow for + Adiabatic Quantum-Flux-Parametron Superconducting Circuits DATE 2024 + + +
+ Superconducting circuits, like Adiabatic Quantum-Flux-Parametron (AQFP), +offer exceptional energy efficiency but face challenges in physical design due +to sophisticated spacing and timing constraints. Current design tools often +neglect the importance of constraint adherence throughout the entire design +flow. In this paper, we propose SuperFlow, a fully-customized RTL-to-GDS design +flow tailored for AQFP devices. SuperFlow leverages a synthesis tool based on +CMOS technology to transform any input RTL netlist to an AQFP-based netlist. +Subsequently, we devise a novel place-and-route procedure that simultaneously +considers wirelength, timing, and routability for AQFP circuits. The process +culminates in the generation of the AQFP circuit layout, followed by a Design +Rule Check (DRC) to identify and rectify any layout violations. Our +experimental results demonstrate that SuperFlow achieves 12.8% wirelength +improvement on average and 12.1% better timing quality compared with previous +state-of-the-art placers for AQFP circuits. + +
+
+ comment: Accepted by DATE 2024 +
+
+
+
+
+ + ☆ MapTune: Advancing ASIC Technology Mapping via Reinforcement Learning + Guided Library Tuning + + +
+ Technology mapping involves mapping logical circuits to a library of cells.
+Traditionally, the full technology library is used, leading to a large search
+space and potential overhead. Motivated by randomly sampled technology mapping
+case studies, we propose the MapTune framework that addresses this challenge by
+utilizing reinforcement learning to make design-specific choices during cell
+selection. By learning from the environment, MapTune refines the cell selection
+process, resulting in a reduced search space and potentially improved mapping
+quality.
+ The effectiveness of MapTune is evaluated on a wide range of benchmarks,
+different technology libraries and technology mappers. The experimental results
+demonstrate that MapTune achieves higher mapping accuracy and reduced
+delay/area across diverse circuit designs, technology libraries and mappers.
+The paper also discusses the Pareto-Optimal exploration and confirms the
+perpetual delay-area trade-off. Conducted on the benchmark suites ISCAS 85/89,
+ITC/ISCAS 99, VTR8.0 and EPFL, the post-technology mapping and post-sizing
+quality-of-results (QoR) have been significantly improved, with an average
+Area-Delay Product (ADP) improvement of 22.54\% among all different exploration
+settings in MapTune. The improvements remain consistent across four different
+technologies (7nm, 45nm, 130nm, and 180nm) and two different mappers.
+
+
+ comment: IEEE/ACM International Conference on Computer-Aided Design (ICCAD + '24), October 27--31, 2024 +
+
+
+
+
+ + ☆ HG-PIPE: Vision Transformer Acceleration with Hybrid-Grained Pipeline + + +
+ Vision Transformer (ViT) acceleration with field programmable gate array +(FPGA) is promising but challenging. Existing FPGA-based ViT accelerators +mainly rely on temporal architectures, which process different operators by +reusing the same hardware blocks and suffer from extensive memory access +overhead. Pipelined architectures, either coarse-grained or fine-grained, +unroll the ViT computation spatially for memory access efficiency. However, +they usually suffer from significant hardware resource constraints and pipeline +bubbles induced by the global computation dependency of ViT. In this paper, we +introduce HG-PIPE, a pipelined FPGA accelerator for high-throughput and +low-latency ViT processing. HG-PIPE features a hybrid-grained pipeline +architecture to reduce on-chip buffer cost and couples the computation dataflow +and parallelism design to eliminate the pipeline bubbles. HG-PIPE further +introduces careful approximations to implement both linear and non-linear +operators with abundant Lookup Tables (LUTs), thus alleviating resource +constraints. On a ZCU102 FPGA, HG-PIPE achieves 2.78 times better throughput +and 2.52 times better resource efficiency than the prior-art accelerators, +e.g., AutoViTAcc. With a VCK190 FPGA, HG-PIPE realizes end-to-end ViT +acceleration on a single device and achieves 7118 images/s, which is 2.81 times +faster than a V100 GPU. + +
+
+ comment: Accepted by ICCAD 2024 +
+
+
+
+
+ + ☆ Exploring the Limitations of Kolmogorov-Arnold Networks in + Classification: Insights to Software Training and Hardware Implementation + + +
+ Kolmogorov-Arnold Networks (KANs), a novel type of neural network, have
+recently gained popularity and attention due to their ability to substitute
+multi-layer perceptrons (MLPs) in artificial intelligence (AI) with higher
+accuracy and interpretability. However, KAN assessment is still limited and
+cannot provide an in-depth analysis of a specific domain. Furthermore, no study
+has been conducted on the implementation of KANs in hardware design, which
+would directly demonstrate whether KANs are truly superior to MLPs in practical
+applications. As a result, in this paper, we focus on verifying KANs for
+classification issues, which are a common but significant topic in AI, using
+four different types of datasets. Furthermore, the corresponding hardware
+implementation is considered using the Vitis high-level synthesis (HLS) tool.
+To the best of our knowledge, this is the first article to implement hardware
+for KAN. The results indicate that KANs cannot achieve more accuracy than MLPs
+on highly complex datasets while utilizing substantially higher hardware
+resources. Therefore, MLP remains an effective approach for achieving accuracy
+and efficiency in software and hardware implementation.
+
+
+ comment: 6 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ CHOSEN: Compilation to Hardware Optimization Stack for Efficient Vision + Transformer Inference + + +
+ Vision Transformers (ViTs) represent a groundbreaking shift in machine
+learning approaches to computer vision. Unlike traditional approaches, ViTs
+employ the self-attention mechanism, which has been widely used in natural
+language processing, to analyze image patches. Despite their advantages in
+modeling visual tasks, deploying ViTs on hardware platforms, notably
+Field-Programmable Gate Arrays (FPGAs), introduces considerable challenges.
+These challenges stem primarily from the non-linear calculations and high
+computational and memory demands of ViTs. This paper introduces CHOSEN, a
+software-hardware co-design framework to address these challenges and offer an
+automated framework for ViT deployment on FPGAs in order to maximize
+performance. Our framework is built upon three fundamental contributions: a
+multi-kernel design to maximize the bandwidth, mainly targeting the benefits of
+multiple DDR memory banks; approximate non-linear functions that exhibit
+minimal accuracy degradation, together with efficient use of the available
+logic blocks on the FPGA; and an efficient compiler to maximize the performance
+and memory-efficiency of the computing kernels, presenting a novel design space
+exploration algorithm to find the hardware configuration that achieves optimal
+throughput and latency. Compared to the state-of-the-art ViT accelerators,
+CHOSEN achieves a 1.5x and 1.42x improvement in throughput on the DeiT-S and
+DeiT-B models.
+
+
+
+
+
+ + ♻ ☆ Understanding the Security Benefits and Overheads of Emerging Industry + Solutions to DRAM Read Disturbance + + +
+ We present the first rigorous security, performance, energy, and cost +analyses of the state-of-the-art on-DRAM-die read disturbance mitigation +method, Per Row Activation Counting (PRAC), described in JEDEC DDR5 +specification's April 2024 update. Unlike prior state-of-the-art that advises +the memory controller to periodically issue refresh management (RFM) commands, +which provides the DRAM chip with time to perform refreshes, PRAC introduces a +new back-off signal. PRAC's back-off signal propagates from the DRAM chip to +the memory controller and forces the memory controller to 1) stop serving +requests and 2) issue RFM commands. As a result, RFM commands are issued when +needed as opposed to periodically, reducing RFM's overheads. We analyze PRAC in +four steps. First, we define an adversarial access pattern that represents the +worst-case for PRAC's security. Second, we investigate PRAC's configurations +and security implications. Our analyses show that PRAC can be configured for +secure operation as long as no bitflip occurs before accessing a memory +location 10 times. Third, we evaluate the performance impact of PRAC and +compare it against prior works using Ramulator 2.0. Our analysis shows that +while PRAC incurs less than 13% performance overhead for today's DRAM chips, +its performance overheads can reach up to 94% for future DRAM chips that are +more vulnerable to read disturbance bitflips. Fourth, we define an availability +adversarial access pattern that exacerbates PRAC's performance overhead to +perform a memory performance attack, demonstrating that such an adversarial +pattern can hog up to 94% of DRAM throughput and degrade system throughput by +up to 95%. We discuss PRAC's implications on future systems and foreshadow +future research directions. To aid future research, we open-source our +implementations and scripts at https://github.com/CMU-SAFARI/ramulator2. + +
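The counting mechanism at the heart of PRAC can be pictured with a toy model. This is purely illustrative: the class name, threshold value, and reset policy below are assumptions, not the JEDEC DDR5 specification.

    from collections import defaultdict

    class PracBank:
        """Toy per-row activation counting with a back-off signal."""
        def __init__(self, backoff_threshold=32):   # threshold is made up
            self.counts = defaultdict(int)
            self.threshold = backoff_threshold

        def activate(self, row):
            # Each activation increments the row's counter. Crossing the
            # threshold raises back-off, which forces the memory controller
            # to stop serving requests and issue RFM commands, so refreshes
            # happen only when needed rather than periodically.
            self.counts[row] += 1
            return self.counts[row] >= self.threshold

    bank = PracBank()
    for _ in range(40):                  # a row-hammering access pattern
        if bank.activate(row=7):
            bank.counts[7] = 0           # RFM: refresh victims, reset counter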
+
+ comment: To appear in DRAMSec 2024 +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 18 + +
+
+
+ + ☆ Differentiable Quantum Architecture Search in Asynchronous Quantum + Reinforcement Learning + + +
+ The emergence of quantum reinforcement learning (QRL) is propelled by +advancements in quantum computing (QC) and machine learning (ML), particularly +through quantum neural networks (QNN) built on variational quantum circuits +(VQC). These advancements have proven successful in addressing sequential +decision-making tasks. However, constructing effective QRL models demands +significant expertise due to challenges in designing quantum circuit +architectures, including data encoding and parameterized circuits, which +profoundly influence model performance. In this paper, we propose addressing +this challenge with differentiable quantum architecture search (DiffQAS), +enabling trainable circuit parameters and structure weights using +gradient-based optimization. Furthermore, we enhance training efficiency +through asynchronous reinforcement learning (RL) methods facilitating parallel +training. Through numerical simulations, we demonstrate that our proposed +DiffQAS-QRL approach achieves performance comparable to manually-crafted +circuit architectures across considered environments, showcasing stability +across diverse scenarios. This methodology offers a pathway for designing QRL +models without extensive quantum knowledge, ensuring robust performance and +fostering broader application of QRL. + +
+
+ comment: Accepted by IEEE International Conference on Quantum Computing and + Engineering - QCE 2024 +
+
+
+
+
+ + ☆ Sparse Incremental Aggregation in Multi-Hop Federated Learning SP + + +
+ This paper investigates federated learning (FL) in a multi-hop communication
+setup, such as in constellations with inter-satellite links. In this setup,
+part of the FL clients are responsible for forwarding other clients' results to
+the parameter server. Instead of using conventional routing, the communication
+efficiency can be improved significantly by using in-network model aggregation
+at each intermediate hop, known as incremental aggregation (IA). Prior works
+[1] have indicated diminishing gains for IA under gradient sparsification. Here
+we study this issue and propose several novel correlated sparsification methods
+for IA. Numerical results show that, for some of these algorithms, the full
+potential of IA is still available under sparsification without impairing
+convergence. We demonstrate a 15x improvement in communication efficiency over
+conventional routing and an 11x improvement over state-of-the-art (SoA) sparse
+IA.
+
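A rough Python sketch of the baseline this abstract starts from, plain (uncorrelated) top-k sparsification combined with incremental aggregation; the paper's correlated sparsification methods are not reproduced here, and all sizes are illustrative.

    import numpy as np

    def top_k(g, k):
        """Keep only the k largest-magnitude entries of a gradient vector."""
        out = np.zeros_like(g)
        idx = np.argsort(np.abs(g))[-k:]
        out[idx] = g[idx]
        return out

    def incremental_aggregate(route_grads, k):
        # Each hop adds its sparsified gradient to the forwarded partial sum
        # instead of relaying every client's vector separately. Note that the
        # partial sum grows denser hop by hop, which is why naive
        # sparsification erodes IA's communication gains.
        partial = np.zeros_like(route_grads[0])
        for g in route_grads:
            partial += top_k(g, k)
        return partial   # delivered to the parameter server

    grads = [np.random.randn(1000) for _ in range(5)]   # 5 hops on the route
    print(np.count_nonzero(incremental_aggregate(grads, k=100)))  # <= 500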
+
+ comment: This paper is accepted for the 25th IEEE International Workshop on + Signal Processing Advances in Wireless Communications (SPAWC) conference +
+
+
+
+
+ + ☆ StraightLine: An End-to-End Resource-Aware Scheduler for Machine + Learning Application Requests + + +
+ The life cycle of machine learning (ML) applications consists of two stages:
+model development and model deployment. However, traditional ML systems (e.g.,
+training-specific or inference-specific systems) focus on one particular stage
+or phase of the life cycle of ML applications. These systems often aim at
+optimizing model training or accelerating model inference, and they frequently
+assume homogeneous infrastructure, which may not always reflect real-world
+scenarios that include cloud data centers, local servers, containers, and
+serverless platforms. We present StraightLine, an end-to-end resource-aware
+scheduler that schedules the optimal resources (e.g., container, virtual
+machine, or serverless) for different ML application requests in a hybrid
+infrastructure. The key innovation is an empirical dynamic placing algorithm
+that intelligently places requests based on their unique characteristics (e.g.,
+request frequency, input data size, and data distribution). In contrast to
+existing ML systems, StraightLine offers end-to-end resource-aware placement,
+thereby significantly reducing response time and failure rate for model
+deployment when facing different computing resources in the hybrid
+infrastructure.
+
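A hedged sketch of what such a placement decision could look like. StraightLine's actual placing algorithm is empirical and learned from measurements; the thresholds and resource names below are purely illustrative assumptions.

    def place_request(req_per_min: float, input_mb: float) -> str:
        """Illustrative placement policy for ML inference requests in a
        hybrid infrastructure (all thresholds are made up for the sketch)."""
        if req_per_min < 1 and input_mb < 5:
            return "serverless"    # rare, small requests: pay per invocation
        if req_per_min < 100:
            return "container"     # moderate load: fast scale-out
        return "virtual-machine"   # sustained heavy load: dedicated resources

    print(place_request(req_per_min=0.2, input_mb=1.0))    # serverless
    print(place_request(req_per_min=500, input_mb=50.0))   # virtual-machine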
+
+ comment: 6 pages, 8 figures, to appear in AIoTC'24 +
+
+
+
+
+ + ☆ $k$-Center Clustering in Distributed Models + + +
+ The $k$-center problem is a central optimization problem with numerous +applications for machine learning, data mining, and communication networks. +Despite extensive study in various scenarios, it surprisingly has not been +thoroughly explored in the traditional distributed setting, where the +communication graph of a network also defines the distance metric. + We initiate the study of the $k$-center problem in a setting where the +underlying metric is the graph's shortest path metric in three canonical +distributed settings: the LOCAL, CONGEST, and CLIQUE models. Our results +encompass constant-factor approximation algorithms and lower bounds in these +models, as well as hardness results for the bi-criteria approximation setting. + +
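For context, the classical sequential 2-approximation for metric k-center (Gonzalez' farthest-first traversal) is sketched below. The paper's contribution lies in the LOCAL, CONGEST, and CLIQUE models over the graph's shortest-path metric, which this sequential sketch does not model.

    def greedy_k_center(points, k, dist):
        """Gonzalez' farthest-first traversal: a 2-approximation for
        k-center in any metric space."""
        centers = [points[0]]
        while len(centers) < k:
            # Pick the point farthest from its nearest current center.
            farthest = max(points,
                           key=lambda p: min(dist(p, c) for c in centers))
            centers.append(farthest)
        return centers

    dist = lambda a, b: abs(a - b)   # 1-D metric, just for the example
    print(greedy_k_center([1, 2, 9, 10, 20], k=2, dist=dist))  # [1, 20]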
+
+ comment: Presented at the SIROCCO'24 conference
+
+
+
+
+
+ + ☆ Optimal Broadcast Schedules in Logarithmic Time with Applications to + Broadcast, All-Broadcast, Reduction and All-Reduction + + +
+ We give optimally fast $O(\log p)$ time (per processor) algorithms for +computing round-optimal broadcast schedules for message-passing parallel +computing systems. This affirmatively answers difficult questions posed in a +SPAA 2022 BA and a CLUSTER 2022 paper. We observe that the computed schedules +and circulant communication graph can likewise be used for reduction, +all-broadcast and all-reduction as well, leading to new, round-optimal +algorithms for these problems. These observations affirmatively answer open +questions posed in a CLUSTER 2023 paper. + The problem is to broadcast $n$ indivisible blocks of data from a given root +processor to all other processors in a (subgraph of a) fully connected network +of $p$ processors with fully bidirectional, one-ported communication +capabilities. In this model, $n-1+\lceil\log_2 p\rceil$ communication rounds +are required. Our new algorithms compute for each processor in the network +receive and send schedules each of size $\lceil\log_2 p\rceil$ that determine +uniquely in $O(1)$ time for each communication round the new block that the +processor will receive, and the already received block it has to send. Schedule +computations are done independently per processor without communication. The +broadcast communication subgraph is an easily computable, directed, +$\lceil\log_2 p\rceil$-regular circulant graph also used elsewhere. We show how +the schedule computations can be done in optimal time and space of $O(\log p)$, +improving significantly over previous results of $O(p\log^2 p)$ and $O(\log^3 +p)$, respectively. The schedule computation and broadcast algorithms are simple +to implement, but correctness and complexity are not obvious. The schedules are +used for new implementations of the MPI (Message-Passing Interface) collectives +MPI_Bcast, MPI_Allgatherv, MPI_Reduce and MPI_Reduce_scatter. Preliminary +experimental results are given. + +
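Two quantities from this abstract are easy to state in code: the optimal round count and a ⌈log₂ p⌉-regular circulant neighborhood. The power-of-two skip structure below is one common construction and an assumption on our part; the paper's exact graph and its O(log p) schedule computation are not reproduced.

    import math

    def broadcast_rounds(n, p):
        """Round-optimal broadcast of n blocks to p processors needs
        n - 1 + ceil(log2 p) rounds in the one-ported, bidirectional model."""
        return n - 1 + math.ceil(math.log2(p))

    def circulant_neighbors(rank, p):
        """A ceil(log2 p)-regular directed circulant graph with
        power-of-two skips (illustrative; the paper's graph may differ)."""
        q = math.ceil(math.log2(p))
        return [(rank + 2 ** j) % p for j in range(q)]

    print(broadcast_rounds(n=8, p=16))        # 8 - 1 + 4 = 11 rounds
    print(circulant_neighbors(rank=0, p=16))  # [1, 2, 4, 8]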
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2312.11236 +
+
+
+
+
+ + ☆ DualFed: Enjoying both Generalization and Personalization in Federated + Learning via Hierachical Representations + + +
+ In personalized federated learning (PFL), it is widely recognized that
+achieving both high model generalization and effective personalization poses a
+significant challenge due to their conflicting nature. As a result, existing
+PFL methods can only manage a trade-off between these two objectives. This
+raises an interesting question: Is it feasible to develop a model capable of
+achieving both objectives simultaneously? Our paper presents an affirmative
+answer, and the key lies in the observation that deep models inherently exhibit
+hierarchical architectures, which produce representations with various levels
+of generalization and personalization at different stages. A straightforward
+approach stemming from this observation is to select multiple representations
+from these layers and combine them to concurrently achieve generalization and
+personalization. However, the number of candidate representations is commonly
+huge, which makes this method infeasible due to high computational costs. To
+address this problem, we propose DualFed, a new method that can directly yield
+dual representations corresponding to generalization and personalization
+respectively, thereby simplifying the optimization task. Specifically, DualFed
+inserts a personalized projection network between the encoder and classifier.
+The pre-projection representations are able to capture generalized information
+shareable across clients, and the post-projection representations are effective
+at capturing task-specific information on local clients. This design minimizes
+the mutual interference between generalization and personalization, thereby
+achieving a win-win situation. Extensive experiments show that DualFed can
+outperform other FL methods. Code is available at
+https://github.com/GuogangZhu/DualFed.
+
+
+ comment: Accepted by ACM Multimedia 2024
+
+
+
+
+
+ + ☆ SOK: Blockchain for Provenance + + +
+ Provenance, which traces data from its creation to manipulation, is crucial
+for ensuring data integrity, reliability, and trustworthiness. It is valuable
+for single-user applications, collaboration within organizations, and across
+organizations. Blockchain technology has become a popular choice for
+implementing provenance due to its distributed, transparent, and immutable
+nature. Numerous blockchain designs in the literature are specifically
+dedicated to provenance. Our goal is to provide a new perspective on the
+blockchain-based provenance field by identifying the challenges faced and
+suggesting future research directions. In this paper, we categorize the problem
+statement into three main research questions to investigate key issues
+comprehensively and propose a new outlook on the use of blockchains. The first
+focuses on challenges in non-collaborative, single-source environments, the
+second examines implications in collaborative environments and different
+domains such as supply chains, scientific collaboration and digital forensics,
+and the last one analyzes communication and data exchange challenges between
+organizations using different blockchains. The interconnected nature of these
+research questions ensures a thorough exploration of provenance requirements,
+leading to more effective and secure systems. After analyzing the requirements
+of provenance in different environments, we provide future design
+considerations for provenance-based blockchains, including blockchain type,
+query mechanisms, provenance capture methods, and domain-specific
+considerations. We also discuss future work and possible extensions in this
+field.
+
+
+
+
+
+ + ☆ Empowering the Quantum Cloud User with QRIO + + +
+ Quantum computing is moving swiftly from theoretical to practical
+applications, making it crucial to establish a significant quantum advantage.
+Despite substantial investments, access to quantum devices is still limited,
+with users facing issues like long wait times and inefficient resource
+management. Unlike the mature cloud solutions for classical computing, quantum
+computing lacks effective infrastructure for resource optimization. We propose
+a Quantum Resource Infrastructure Orchestrator (QRIO), a state-of-the-art cloud
+resource manager built on Kubernetes that is tailored to quantum computing.
+QRIO seeks to democratize access to quantum devices by providing customizable,
+user-friendly, open-source resource management. QRIO's design aims to ensure
+equitable access, optimize resource utilization, and support diverse
+applications, thereby speeding up innovation and making quantum computing more
+accessible and efficient to a broader user base. In this paper, we discuss
+QRIO's various features and evaluate its capability in several representative
+use cases.
+
+
+
+
+
+ + ☆ SCALE: Self-regulated Clustered federAted LEarning in a Homogeneous + Environment + + +
+ Federated Learning (FL) has emerged as a transformative approach for enabling +distributed machine learning while preserving user privacy, yet it faces +challenges like communication inefficiencies and reliance on centralized +infrastructures, leading to increased latency and costs. This paper presents a +novel FL methodology that overcomes these limitations by eliminating the +dependency on edge servers, employing a server-assisted Proximity Evaluation +for dynamic cluster formation based on data similarity, performance indices, +and geographical proximity. Our integrated approach enhances operational +efficiency and scalability through a Hybrid Decentralized Aggregation Protocol, +which merges local model training with peer-to-peer weight exchange and a +centralized final aggregation managed by a dynamically elected driver node, +significantly curtailing global communication overhead. Additionally, the +methodology includes Decentralized Driver Selection, Check-pointing to reduce +network traffic, and a Health Status Verification Mechanism for system +robustness. Validated using the breast cancer dataset, our architecture not +only demonstrates a nearly tenfold reduction in communication overhead but also +shows remarkable improvements in reducing training latency and energy +consumption while maintaining high learning performance, offering a scalable, +efficient, and privacy-preserving solution for the future of federated learning +ecosystems. + +
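A minimal sketch of how a composite proximity score for cluster formation might combine the three signals named above. The weights, normalizations, and decay constant are illustrative assumptions, not the paper's formula.

    import math

    def proximity_score(sim, perf, geo_km, w=(0.5, 0.3, 0.2)):
        """Toy composite score for server-assisted Proximity Evaluation;
        higher is better. 'sim' is data-distribution similarity in [0,1],
        'perf' a normalized performance index in [0,1], 'geo_km' distance."""
        geo = math.exp(-geo_km / 1000.0)    # decays with distance
        return w[0] * sim + w[1] * perf + w[2] * geo

    # A client joins the cluster whose representative maximizes the score:
    clusters = {"A": (0.9, 0.7, 120.0), "B": (0.4, 0.9, 10.0)}
    best = max(clusters, key=lambda c: proximity_score(*clusters[c]))
    print(best)   # "A" under these illustrative weights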
+
+ comment: This research article has been accepted at the COMPSAC conference
+ and will be published by IEEE
+
+
+
+
+
+ + ☆ Leveraging Core and Uncore Frequency Scaling for Power-Efficient + Serverless Workflows + + +
+ Serverless workflows have emerged in FaaS platforms to represent the
+operational structure of traditional applications. With latency propagation
+effects becoming increasingly prominent, step-wise resource tuning is required
+to address the end-to-end Quality-of-Service (QoS) requirements. Modern
+processors' allowance for fine-grained Dynamic Voltage and Frequency Scaling
+(DVFS), coupled with the intermittent nature of serverless workflows, presents
+a unique opportunity to reduce power while meeting QoS.
+ In this paper, we introduce a QoS-aware DVFS framework for serverless
+workflows. {\Omega}kypous regulates the end-to-end latency of serverless
+workflows by supplying the system with the Core/Uncore frequency combination
+that minimizes power consumption. With Uncore DVFS enriching the space of
+efficient power configurations, we devise a grey-box model that accurately
+projects functions' execution latency and power for the applied Core and
+Uncore frequency combination. To the best of our knowledge, {\Omega}kypous is
+the first work that leverages Core and Uncore DVFS as an integral part of
+serverless workflows. Our evaluation on the analyzed Azure Trace, against
+state-of-the-art (SotA) power managers, demonstrates an average power
+consumption reduction of 9% (up to 21%) while minimizing QoS violations.
+
+
+
+
+
+ + ☆ FADAS: Towards Federated Adaptive Asynchronous Optimization + + +
+ Federated learning (FL) has emerged as a widely adopted training paradigm for +privacy-preserving machine learning. While the SGD-based FL algorithms have +demonstrated considerable success in the past, there is a growing trend towards +adopting adaptive federated optimization methods, particularly for training +large-scale models. However, the conventional synchronous aggregation design +poses a significant challenge to the practical deployment of those adaptive +federated optimization methods, particularly in the presence of straggler +clients. To fill this research gap, this paper introduces federated adaptive +asynchronous optimization, named FADAS, a novel method that incorporates +asynchronous updates into adaptive federated optimization with provable +guarantees. To further enhance the efficiency and resilience of our proposed +method in scenarios with significant asynchronous delays, we also extend FADAS +with a delay-adaptive learning adjustment strategy. We rigorously establish the +convergence rate of the proposed algorithms and empirical results demonstrate +the superior performance of FADAS over other asynchronous FL baselines. + +
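The general shape of an asynchronous adaptive server update can be sketched as follows: an Adam-style step applied to asynchronously arriving client deltas, with simple staleness damping. This is an assumption-laden illustration, not the exact FADAS update rule or its delay-adaptive learning-rate schedule.

    import numpy as np

    class AsyncAdaptiveServer:
        """Sketch of adaptive federated optimization with async updates."""
        def __init__(self, dim, lr=0.01, b1=0.9, b2=0.99, eps=1e-8):
            self.x = np.zeros(dim)    # global model
            self.m = np.zeros(dim)    # first moment
            self.v = np.zeros(dim)    # second moment
            self.lr, self.b1, self.b2, self.eps = lr, b1, b2, eps

        def on_client_delta(self, delta, staleness):
            # Damp stale updates (a simple delay-adaptive idea, not FADAS').
            delta = delta / (1.0 + staleness)
            self.m = self.b1 * self.m + (1 - self.b1) * delta
            self.v = self.b2 * self.v + (1 - self.b2) * delta ** 2
            self.x += self.lr * self.m / (np.sqrt(self.v) + self.eps)

    server = AsyncAdaptiveServer(dim=10)
    server.on_client_delta(np.random.randn(10), staleness=3)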
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ☆ Generative AI like ChatGPT in Blockchain Federated Learning: use cases, + opportunities and future + + +
+ Federated learning has become a significant approach for training machine +learning models using decentralized data without necessitating the sharing of +this data. Recently, the incorporation of generative artificial intelligence +(AI) methods has provided new possibilities for improving privacy, augmenting +data, and customizing models. This research explores potential integrations of +generative AI in federated learning, revealing various opportunities to enhance +privacy, data efficiency, and model performance. It particularly emphasizes the +importance of generative models like generative adversarial networks (GANs) and +variational autoencoders (VAEs) in creating synthetic data that replicates the +distribution of real data. Generating synthetic data helps federated learning +address challenges related to limited data availability and supports robust +model development. Additionally, we examine various applications of generative +AI in federated learning that enable more personalized solutions. + +
+
+ comment: This research article will be submitted to a conference that best
+ fits its topic
+
+
+
+
+
+ + ☆ HPAC-ML: A Programming Model for Embedding ML Surrogates in Scientific + Applications SC24 + + +
+ The end of Dennard scaling and the slowdown of Moore's Law led to +heterogeneous architectures benefiting machine learning (ML) algorithms. These +hardware advancements and the development of intuitive domain-specific +languages have made ML more accessible, democratizing innovation. ML models +surpass traditional approximation limits, broadening opportunities and evolving +from statistical to complex function modeling. Consequently, scientific +applications leverage ML models for enhanced execution speeds. However, +integrating ML models remains manual and complex, slowing the adoption of ML as +an approximation technique in modern applications. + We propose an easy-to-use directive-based programming model that enables +developers to describe the use of ML models in scientific applications. The +runtime support, as instructed by the programming model, performs data +assimilation using the original algorithm and can replace the algorithm with +model inference. Our evaluation across five benchmarks, testing over 5000 ML +models, shows up to 83.6x speed improvements with minimal accuracy loss (as low +as 0.01 RMSE). + +
+
+ comment: 16 pages, 9 figures. Accepted at SC24 +
+
+
+
+
+ + ♻ ☆ SoK: Bridging Trust into the Blockchain. A Systematic Review on On-Chain + Identity + + +
+ The ongoing regulation of blockchain-based services and applications requires +the identification of users who are issuing transactions on the blockchain. +This systematic review explores the current status, identifies research gaps, +and outlines future research directions for establishing trusted and +privacy-compliant identities on the blockchain (on-chain identity). A +systematic search term was applied across various scientific databases, +collecting 2232 potentially relevant research papers. These papers were +narrowed down in two methodologically executed steps to 98 and finally to 13 +relevant sources. The relevant articles were then systematically analyzed based +on a set of screening questions. The results of the selected studies have +provided insightful findings on the mechanisms of on-chain identities. On-chain +identities are established using zero-knowledge proofs, public key +infrastructure/certificates, and web of trust approaches. The technologies and +architectures used by the authors are also highlighted. Trust has emerged as a +key research gap, manifesting in two ways: firstly, a gap in how to trust the +digital identity representation of a physical human; secondly, a gap in how to +trust identity providers that issue identity confirmations on-chain. Potential +future research avenues are suggested to help fill the current gaps in +establishing trust and on-chain identities. + +
+
+
+
+
+ + ♻ ☆ GeoFaaS: An Edge-to-Cloud FaaS Platform + + +
+ The massive growth of mobile and IoT devices demands geographically
+distributed computing systems for optimal performance, privacy, and
+scalability. However, existing edge-to-cloud serverless platforms lack location
+awareness, resulting in inefficient network usage and increased latency.
+ In this paper, we propose GeoFaaS, a novel edge-to-cloud
+Function-as-a-Service (FaaS) platform that leverages real-time client location
+information for transparent request execution on the nearest available FaaS
+node. If needed, GeoFaaS transparently offloads requests to the cloud when edge
+resources are overloaded, thus ensuring consistent execution without user
+intervention. GeoFaaS has a modular and decentralized architecture: building on
+the single-node FaaS system tinyFaaS, GeoFaaS works as a stand-alone
+edge-to-cloud FaaS platform but can also integrate and act as a routing layer
+for existing FaaS services, e.g., in the cloud. To evaluate our approach, we
+implemented an open-source proof-of-concept prototype and studied performance
+and fault-tolerance behavior in experiments.
+
+
+ comment: Accepted for publication in 12th IEEE International Conference on + Cloud Engineering (IC2E 2024) +
+
+
+
+
+ + ♻ ☆ ServerlessLLM: Low-Latency Serverless Inference for Large Language + Models + + +
+ This paper presents ServerlessLLM, a distributed system designed to support +low-latency serverless inference for Large Language Models (LLMs). By +harnessing the substantial near-GPU storage and memory capacities of inference +servers, ServerlessLLM achieves effective local checkpoint storage, minimizing +the need for remote checkpoint downloads and ensuring efficient checkpoint +loading. The design of ServerlessLLM features three core contributions: (i) +\emph{fast multi-tier checkpoint loading}, featuring a new loading-optimized +checkpoint format and a multi-tier loading system, fully utilizing the +bandwidth of complex storage hierarchies on GPU servers; (ii) \emph{efficient +live migration of LLM inference}, which enables newly initiated inferences to +capitalize on local checkpoint storage while ensuring minimal user +interruption; and (iii) \emph{startup-time-optimized model scheduling}, which +assesses the locality statuses of checkpoints on each server and schedules the +model onto servers that minimize the time to start the inference. Comprehensive +evaluations, including microbenchmarks and real-world scenarios, demonstrate +that ServerlessLLM dramatically outperforms state-of-the-art serverless +systems, reducing latency by 10 - 200X across various LLM inference workloads. + +
+
+ comment: 18th USENIX Symposium on Operating Systems Design and Implementation +
+
+
+
+
+ + ♻ ☆ High Significant Fault Detection in Azure Core Workload Insights + + +
+ Azure Core workload insights have time-series data with different metric
+units. Faults or anomalies are observed in these time-series data owing to
+faults observed with respect to metric name, resource region, dimensions, and
+the dimension values associated with the data. For Azure Core, an important
+task is to highlight faults or anomalies to the user on a dashboard that they
+can perceive easily. The reported anomalies should be highly significant and
+limited in number, e.g., 5-20 anomalies reported per hour. The reported
+anomalies will have significant user perception and high reconstruction error
+in any time-series forecasting model. Hence, our task is to automatically
+identify 'high significant anomalies' and their associated information for user
+perception.
+
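A minimal sketch of the filtering step described above: rank candidate anomalies by reconstruction error and report only a bounded number per hour. Field names and thresholds are assumptions for illustration, not the paper's pipeline.

    from collections import defaultdict

    def top_significant_anomalies(anomalies, per_hour=10, min_error=0.5):
        """'anomalies' is a list of dicts with keys 'timestamp_hour' and
        'reconstruction_error'; keep at most `per_hour` of the most
        significant ones per hour (both thresholds illustrative)."""
        by_hour = defaultdict(list)
        for a in anomalies:
            if a["reconstruction_error"] >= min_error:
                by_hour[a["timestamp_hour"]].append(a)
        report = []
        for hour, items in by_hour.items():
            items.sort(key=lambda a: a["reconstruction_error"], reverse=True)
            report.extend(items[:per_hour])   # e.g. 5-20 anomalies per hour
        return report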
+
+ comment: Published in IAAI 2024, which is the Industrial track of AAAI 2024 +
+
+
+
+
+ + ♻ ☆ How to Rent GPUs on a Budget + + +
+ The explosion in Machine Learning (ML) over the past ten years has led to a +dramatic increase in demand for GPUs to train ML models. Because it is +prohibitively expensive for most users to build and maintain a large GPU +cluster, large cloud providers (Microsoft Azure, Amazon AWS, Google Cloud) have +seen explosive growth in demand for renting cloud-based GPUs. In this +cloud-computing paradigm, a user must specify their demand for GPUs at every +moment in time, and will pay for every GPU-hour they use. ML training jobs are +known to be parallelizable to different degrees. Given a stream of ML training +jobs, a user typically wants to minimize the mean response time across all +jobs. Here, the response time of a job denotes the time from when a job arrives +until it is complete. Additionally, the user is constrained by some operating +budget. Specifically, in this paper the user is constrained to use no more than +$b$ GPUs per hour, over a long-run time average. The question is how to +minimize mean response time while meeting the budget constraint. Because +training jobs receive a diminishing marginal benefit from running on additional +GPUs, allocating too many GPUs to a single training job can dramatically +increase the overall cost paid by the user. Hence, an optimal rental policy +must balance a tradeoff between training cost and mean response time. This +paper derives the optimal rental policy for a stream of training jobs where the +jobs have different levels of parallelizability (specified by a speedup +function) and different job sizes (amounts of inherent work). We make almost no +assumptions about the arrival process and about the job size distribution. Our +optimal policy specifies how many GPUs to rent at every moment in time and how +to allocate these GPUs. + +
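The cost/response-time tension this abstract describes can be illustrated with a toy diminishing-returns speedup function (the exponent is an assumption; the paper's optimal rental policy itself is not reproduced).

    def speedup(k, alpha=0.7):
        """Sublinear speedup: k GPUs finish work k**alpha times faster
        (an illustrative diminishing-returns model)."""
        return k ** alpha

    def job_cost_and_time(work, k):
        t = work / speedup(k)    # response time of the job
        return k * t, t          # GPU-hours paid, completion time

    for k in (1, 4, 16):
        cost, t = job_cost_and_time(work=100.0, k=k)
        print(f"{k:2d} GPUs: {t:6.1f} h, {cost:7.1f} GPU-hours")
    # More GPUs shrink response time but inflate total GPU-hours, which is
    # exactly what the long-run budget of b GPUs per hour constrains.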
+
+
+
+
+
+
+
+
 + Operating Systems 3 +
 +
+
+
+ + ☆ Rusty Linux: Advances in Rust for Linux Kernel Development + + +
+ Context: The integration of Rust into kernel development is a transformative
+endeavor aimed at enhancing system security and reliability by leveraging
+Rust's strong memory safety guarantees. Objective: We aim to survey the current
+advances in using Rust in kernel development to reduce the number of memory
+safety vulnerabilities in one of the most critical pieces of software that
+underpins all modern applications. Method: By analyzing a broad spectrum of
+studies, we identify the advantages Rust offers, highlight the challenges
+faced, and emphasize the need for community consensus on Rust's adoption.
+Results: Our findings suggest that while the initial implementations of Rust in
+the kernel show promising results in terms of safety and stability, significant
+challenges remain. These challenges include achieving seamless interoperability
+with existing kernel components, maintaining performance, and ensuring adequate
+support and tooling for developers. Conclusions: This study underscores the
+need for continued research and practical implementation efforts to fully
+realize the benefits of Rust. By addressing these challenges, the integration
+of Rust could mark a significant step forward in the evolution of operating
+system development towards safer and more reliable systems.
+
+
+ comment: This paper has been accepted for publication and presentation at ESEM + 2024 Emerging Results, Vision and Reflection Papers Track to be held in + Barcelona, Spain on October 24-25, 2024 +
+
+
+
+
+ + ☆ Design and demonstration of an operating system for executing + applications on quantum network nodes + + +
+ The goal of future quantum networks is to enable new internet applications +that are impossible to achieve using solely classical communication. Up to now, +demonstrations of quantum network applications and functionalities on quantum +processors have been performed in ad-hoc software that was specific to the +experimental setup, programmed to perform one single task (the application +experiment) directly into low-level control devices using expertise in +experimental physics. Here, we report on the design and implementation of the +first architecture capable of executing quantum network applications on quantum +processors in platform-independent high-level software. We demonstrate the +architecture's capability to execute applications in high-level software, by +implementing it as a quantum network operating system -- QNodeOS -- and +executing test programs including a delegated computation from a client to a +server on two quantum network nodes based on nitrogen-vacancy (NV) centers in +diamond. We show how our architecture allows us to maximize the use of quantum +network hardware, by multitasking different applications on a quantum network +for the first time. Our architecture can be used to execute programs on any +quantum processor platform corresponding to our system model, which we +illustrate by demonstrating an additional driver for QNodeOS for a trapped-ion +quantum network node based on a single $^{40}\text{Ca}^+$ atom. Our +architecture lays the groundwork for computer science research in the domain of +quantum network programming, and paves the way for the development of software +that can bring quantum network technology to society. + +
+
+ comment: 12 pages, 5 figures, supplementary materials (48 pages, 24 figures, + 11 tables) +
+
+
+
+
+ + ♻ ☆ VeriFence: Lightweight and Precise Spectre Defenses for Untrusted Linux + Kernel Extensions RAID'24 + + +
+ High-performance IO demands low-overhead communication between user- and +kernel space. This demand can no longer be fulfilled by traditional system +calls. Linux's extended Berkeley Packet Filter (BPF) avoids user-/kernel +transitions by just-in-time compiling user-provided bytecode and executing it +in kernel mode with near-native speed. To still isolate BPF programs from the +kernel, they are statically analyzed for memory- and type-safety, which imposes +some restrictions but allows for good expressiveness and high performance. +However, to mitigate the Spectre vulnerabilities disclosed in 2018, defenses +which reject potentially-dangerous programs had to be deployed. We find that +this affects 31% to 54% of programs in a dataset with 844 real-world BPF +programs from popular open-source projects. To solve this, users are forced to +disable the defenses to continue using the programs, which puts the entire +system at risk. + To enable secure and expressive untrusted Linux kernel extensions, we propose +VeriFence, an enhancement to the kernel's Spectre defenses that reduces the +number of BPF application programs rejected from 54% to zero. We measure +VeriFence's overhead for all mainstream performance-sensitive applications of +BPF (i.e., event tracing, profiling, and packet processing) and find that it +improves significantly upon the status-quo where affected BPF programs are +either unusable or enable transient execution attacks on the kernel. + +
+
+ comment: RAID'24 +
+
+
+
+
+
+
+
+
 + Programming Languages 3 +
 +
+
+
+ + ☆ Detecting and explaining (in)equivalence of context-free grammars + + +
+ We propose a scalable framework for deciding, proving, and explaining
+(in)equivalence of context-free grammars. We present an implementation of the
+framework and evaluate it on large data sets collected within educational
+support systems. Even though the equivalence problem for context-free languages
+is undecidable in general, the framework is able to handle a large portion of
+these datasets. It introduces and combines techniques from several areas, such
+as an abstract grammar transformation language to identify equivalent grammars
+as well as sufficiently similar inequivalent grammars, theory-based comparison
+algorithms for a large class of context-free languages, and a
+graph-theory-inspired grammar canonization that allows isomorphic grammars to
+be identified efficiently.
+
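One ingredient of such frameworks is easy to illustrate: searching for an inequivalence witness by enumerating both languages up to a length bound. The sketch below assumes grammars without epsilon-productions (so over-long sentential forms can be pruned safely); it is a naive baseline, not the paper's transformation language or canonization.

    def language_up_to(grammar, start, max_len):
        """Words of length <= max_len derived from `start`. The grammar maps
        each nonterminal to a list of right-hand-side tuples; any symbol not
        in the grammar is a terminal. With no epsilon-productions, sentential
        forms never shrink, so forms longer than max_len can be discarded."""
        words, seen = set(), set()
        def expand(sent):
            if sent in seen or len(sent) > max_len:
                return
            seen.add(sent)
            for i, sym in enumerate(sent):
                if sym in grammar:                    # leftmost nonterminal
                    for rhs in grammar[sym]:
                        expand(sent[:i] + rhs + sent[i + 1:])
                    return
            words.add("".join(sent))                  # all terminals: a word
        expand((start,))
        return words

    # Two different grammars for the language a+ :
    g1 = {"S": [("a", "S"), ("a",)]}     # right-linear
    g2 = {"S": [("S", "a"), ("a",)]}     # left-linear
    assert language_up_to(g1, "S", 8) == language_up_to(g2, "S", 8)
    # A mismatch is a concrete inequivalence witness; agreement up to a bound
    # is only evidence, since equivalence is undecidable in general.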
+
+
+
+
+ + ☆ Compilation of Commit Changes within Java Source Code Repositories + + +
+ Java applications include third-party dependencies as bytecode. To keep these
+applications secure, researchers have proposed tools to re-identify
+dependencies that contain known vulnerabilities. Yet, to allow such
+re-identification, one must first obtain, for each vulnerability patch, the
+bytecode fixing the respective vulnerability. Such patches for dependencies are
+curated in databases in the form of fix-commits. But fix-commits are in source
+code, and automatically compiling whole Java projects to bytecode is
+notoriously hard, particularly for non-current versions of the code. In this
+paper, we thus propose JESS, an approach that largely avoids this problem by
+compiling solely the relevant code that was modified within a given commit.
+JESS reduces the code, retaining only those parts that the committed change
+references. To avoid name-resolution errors, JESS automatically infers stubs
+for references to entities that are unavailable to the compiler. A challenge
+here is that, to facilitate the above-mentioned re-identification, JESS must
+seek to produce bytecode that is almost identical to the bytecode which one
+would obtain by a successful compilation of the full project. An evaluation on
+347 GitHub projects shows that JESS is able to compile, in isolation, 72% of
+methods and constructors, of which 89% have bytecode equal to the original one.
+Furthermore, on the Project KB database of fix-commits, in which only 8% of
+files modified within the commits can be compiled with the provided build
+scripts, JESS is able to compile 73% of all files that these commits modify.
+
+
+ comment: To be published in: ICSME 2024 Proceedings +
+
+
+
+
+ + ♻ ☆ Bluefish: A Relational Framework for Graphic Representations + + +
+ Diagrams are essential tools for problem-solving and communication as they +externalize conceptual structures using spatial relationships. But when picking +a diagramming framework, users are faced with a dilemma. They can either use a +highly expressive but low-level toolkit, whose API does not match their +domain-specific concepts, or select a high-level typology, which offers a +recognizable vocabulary but supports a limited range of diagrams. To address +this gap, we introduce Bluefish: a diagramming framework inspired by +component-based user interface (UI) libraries. Bluefish lets users create +diagrams using relations: declarative, composable, and extensible diagram +fragments that relax the concept of a UI component. Unlike a component, a +relation does not have sole ownership over its children nor does it need to +fully specify their layout. To render diagrams, Bluefish extends a traditional +tree-based scenegraph to a compound graph that captures both hierarchical and +adjacent relationships between nodes. To evaluate our system, we construct a +diverse example gallery covering many domains including mathematics, physics, +computer science, and even cooking. We show that Bluefish's relations are +effective declarative primitives for diagrams. Bluefish is open source, and we +aim to shape it into both a usable tool and a research platform. + +
+
+ comment: 27 pages, 14 figures +
+
+
+
+
+
+
+
+ + Performance Profiling 4 + +
+
+
+ + ☆ SCALE: Self-regulated Clustered federAted LEarning in a Homogeneous + Environment + + +
+ Federated Learning (FL) has emerged as a transformative approach for enabling +distributed machine learning while preserving user privacy, yet it faces +challenges like communication inefficiencies and reliance on centralized +infrastructures, leading to increased latency and costs. This paper presents a +novel FL methodology that overcomes these limitations by eliminating the +dependency on edge servers, employing a server-assisted Proximity Evaluation +for dynamic cluster formation based on data similarity, performance indices, +and geographical proximity. Our integrated approach enhances operational +efficiency and scalability through a Hybrid Decentralized Aggregation Protocol, +which merges local model training with peer-to-peer weight exchange and a +centralized final aggregation managed by a dynamically elected driver node, +significantly curtailing global communication overhead. Additionally, the +methodology includes Decentralized Driver Selection, Check-pointing to reduce +network traffic, and a Health Status Verification Mechanism for system +robustness. Validated using the breast cancer dataset, our architecture not +only demonstrates a nearly tenfold reduction in communication overhead but also +shows remarkable improvements in reducing training latency and energy +consumption while maintaining high learning performance, offering a scalable, +efficient, and privacy-preserving solution for the future of federated learning +ecosystems. + +
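+ A toy sketch of the proximity-based cluster formation described above. The
+score mixing data similarity, a performance index, and geographic distance is
+an illustrative assumption, as are the weights, threshold, and greedy rule;
+none of them are the paper's actual algorithm.
+
+    import numpy as np
+
+    def proximity(c1, c2, w=(0.5, 0.3, 0.2)):
+        """Pairwise proximity from data similarity, performance gap and
+        geographic distance; the weights are invented for illustration."""
+        data_sim = np.dot(c1["hist"], c2["hist"]) / (
+            np.linalg.norm(c1["hist"]) * np.linalg.norm(c2["hist"]))
+        perf_sim = 1.0 - abs(c1["perf"] - c2["perf"])
+        geo_sim = 1.0 / (1.0 + np.linalg.norm(np.subtract(c1["loc"], c2["loc"])))
+        return w[0] * data_sim + w[1] * perf_sim + w[2] * geo_sim
+
+    clients = [
+        {"hist": np.array([0.7, 0.3]), "perf": 0.9, "loc": (0.0, 0.0)},
+        {"hist": np.array([0.6, 0.4]), "perf": 0.8, "loc": (0.1, 0.0)},
+        {"hist": np.array([0.1, 0.9]), "perf": 0.5, "loc": (5.0, 5.0)},
+    ]
+
+    # Greedy threshold clustering: join the first sufficiently close cluster.
+    clusters, THRESHOLD = [], 0.8
+    for c in clients:
+        for cluster in clusters:
+            if proximity(c, cluster[0]) >= THRESHOLD:
+                cluster.append(c)
+                break
+        else:
+            clusters.append([c])
+    print([len(cl) for cl in clusters])      # -> [2, 1]
+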
+
+ comment: This research article has been accepted at the COMPSAC conference
+  and will be published by IEEE
+
+
+
+
+ + ♻ ☆ Anatomizing Deep Learning Inference in Web Browsers + + +
+ Web applications have increasingly adopted Deep Learning (DL) through
+in-browser inference, wherein DL inference is performed directly within Web
+browsers. The actual performance of in-browser inference and its impacts on the
+quality of experience (QoE) remain unexplored, and urgently require new QoE
+measurements beyond traditional metrics, e.g., page load time. To bridge this
+gap, we make the first comprehensive performance measurement of in-browser
+inference to date. Our approach proposes new metrics to measure in-browser
+inference: responsiveness, smoothness, and inference accuracy. Our extensive
+analysis involves 9 representative DL models across Web browsers of 50 popular
+PC devices and 20 mobile devices. The results reveal that in-browser inference
+exhibits a substantial latency gap, averaging 16.9 times slower on CPU and 4.9
+times slower on GPU compared to native inference on PC devices. The gap on
+mobile CPU and mobile GPU is 15.8 times and 7.8 times, respectively.
+Furthermore, we identify contributing factors to this latency gap, including
+underutilized hardware instruction sets, inherent overhead in the runtime
+environment, resource contention within the browser, and inefficiencies in
+software libraries and GPU abstractions. Additionally, in-browser inference
+imposes significant memory demands, at times exceeding 334.6 times the size of
+the DL models themselves, partly attributable to suboptimal memory management.
+We also observe that in-browser inference leads to a significant 67.2% increase
+in the time it takes for GUI components to render within Web browsers,
+significantly affecting the overall user QoE of Web applications reliant on
+this technology.
+
+
+ comment: Accepted by ACM Transactions on Software Engineering and Methodology + (TOSEM) +
+
+
+
+
+ + ♻ ☆ KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache + + +
+ Efficiently serving large language models (LLMs) requires batching of many +requests to reduce the cost per request. Yet, with larger batch sizes and +longer context lengths, the key-value (KV) cache, which stores attention keys +and values to avoid re-computations, significantly increases memory demands and +becomes the new bottleneck in speed and memory usage. Additionally, the loading +of the KV cache causes the computational core to be idle, which limits the +inference speed. A straightforward and effective solution to reduce KV cache +size is quantization, which decreases the total bytes taken by KV cache. +However, there is a lack of in-depth studies that explore the element +distribution of KV cache to understand the hardness and limitation of KV cache +quantization. To fill the gap, we conducted a comprehensive study on the +element distribution in KV cache of popular LLMs. Our findings indicate that +the key cache should be quantized per-channel, i.e., group elements along the +channel dimension and quantize them together. In contrast, the value cache +should be quantized per-token. From this analysis, we developed a tuning-free +2bit KV cache quantization algorithm named KIVI. With hardware-friendly +implementation, KIVI can enable Llama, Falcon, and Mistral models to maintain +almost the same quality while using $\mathbf{2.6\times}$ less peak memory +(including model weight). This reduction in memory usage enables up to +$\mathbf{4\times}$ larger batch size, bringing $\mathbf{2.35\times \sim +3.47\times}$ throughput on real LLM inference workload. The source code is +available at https://github.com/jy-yuan/KIVI. + +
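+ A minimal NumPy sketch of the grouping asymmetry described above: keys share
+one (scale, zero-point) pair per channel, values one per token. The shapes and
+the plain min/max quantizer are illustrative assumptions, not the authors'
+implementation.
+
+    import numpy as np
+
+    def quantize_2bit(x, axis):
+        """Asymmetric 2-bit quantization: each group gets a scale and a
+        zero-point; `axis` is the axis the group spans."""
+        lo = x.min(axis=axis, keepdims=True)
+        hi = x.max(axis=axis, keepdims=True)
+        scale = np.where(hi > lo, (hi - lo) / 3.0, 1.0)   # 4 levels: 0..3
+        q = np.clip(np.round((x - lo) / scale), 0, 3).astype(np.uint8)
+        return q, scale, lo
+
+    def dequantize(q, scale, lo):
+        return q.astype(np.float32) * scale + lo
+
+    keys = np.random.randn(128, 64).astype(np.float32)    # (tokens, channels)
+    values = np.random.randn(128, 64).astype(np.float32)
+
+    # Keys: one (scale, zero-point) per channel, shared across tokens.
+    qk, sk, zk = quantize_2bit(keys, axis=0)
+    # Values: one (scale, zero-point) per token, shared across channels.
+    qv, sv, zv = quantize_2bit(values, axis=1)
+
+    print("mean |key error|:  ", np.abs(dequantize(qk, sk, zk) - keys).mean())
+    print("mean |value error|:", np.abs(dequantize(qv, sv, zv) - values).mean())
+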
+
+ comment: ICML2024 +
+
+
+
+
+ + ♻ ☆ How to Rent GPUs on a Budget + + +
+ The explosion in Machine Learning (ML) over the past ten years has led to a +dramatic increase in demand for GPUs to train ML models. Because it is +prohibitively expensive for most users to build and maintain a large GPU +cluster, large cloud providers (Microsoft Azure, Amazon AWS, Google Cloud) have +seen explosive growth in demand for renting cloud-based GPUs. In this +cloud-computing paradigm, a user must specify their demand for GPUs at every +moment in time, and will pay for every GPU-hour they use. ML training jobs are +known to be parallelizable to different degrees. Given a stream of ML training +jobs, a user typically wants to minimize the mean response time across all +jobs. Here, the response time of a job denotes the time from when a job arrives +until it is complete. Additionally, the user is constrained by some operating +budget. Specifically, in this paper the user is constrained to use no more than +$b$ GPUs per hour, over a long-run time average. The question is how to +minimize mean response time while meeting the budget constraint. Because +training jobs receive a diminishing marginal benefit from running on additional +GPUs, allocating too many GPUs to a single training job can dramatically +increase the overall cost paid by the user. Hence, an optimal rental policy +must balance a tradeoff between training cost and mean response time. This +paper derives the optimal rental policy for a stream of training jobs where the +jobs have different levels of parallelizability (specified by a speedup +function) and different job sizes (amounts of inherent work). We make almost no +assumptions about the arrival process and about the job size distribution. Our +optimal policy specifies how many GPUs to rent at every moment in time and how +to allocate these GPUs. + +
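+ For intuition about the diminishing-returns tradeoff, here is a greedy
+Python sketch that hands out a GPU budget by marginal speedup. The speedup
+functions are made up, and the greedy rule is only a heuristic, not the
+optimal policy the paper derives.
+
+    import heapq
+    import math
+
+    def allocate(jobs, budget):
+        """Hand out `budget` GPUs one at a time, each to the job with the
+        largest marginal speedup."""
+        alloc = {name: 0 for name in jobs}
+        heap = [(-s(1), name) for name, s in jobs.items()]     # s(0) = 0
+        heapq.heapify(heap)
+        for _ in range(budget):
+            neg_gain, name = heapq.heappop(heap)
+            if -neg_gain <= 0:
+                break                    # no job benefits from another GPU
+            alloc[name] += 1
+            k, s = alloc[name], jobs[name]
+            heapq.heappush(heap, (-(s(k + 1) - s(k)), name))
+        return alloc
+
+    jobs = {
+        "job_a": lambda k: math.sqrt(k),     # poorly parallelizable
+        "job_b": lambda k: k ** 0.9,         # near-linear speedup
+    }
+    print(allocate(jobs, budget=8))          # job_b gets most of the GPUs
+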
+
+
+
+
+
+
+
+ + Computational Complexity 4 + +
+
+
+ + ☆ Semi-Classical Subspaces, The No Synchronization Law, and More + + +
+ This paper looks at the intersection of algorithmic information theory and
+physics, namely quantum mechanics, thermodynamics, and black holes. We discuss
+theorems which characterize the barrier between the quantum world and the
+classical realm. The notion of a "semi-classical subspace" is introduced. The
+No Synchronization Law is detailed, which says separate and isolated physical
+systems evolving over time cannot have thermodynamic algorithmic entropies that
+are in sync. We look at future work involving the Kolmogorov complexity of
+black holes.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2402.13049 +
+
+
+
+
+ + ☆ The Existential Theory of the Reals as a Complexity Class: A Compendium + + +
+ We survey the complexity class $\exists \mathbb{R}$, which captures the +complexity of deciding the existential theory of the reals. The class $\exists +\mathbb{R}$ has roots in two different traditions, one based on the +Blum-Shub-Smale model of real computation, and the other following work by +Mn\"{e}v and Shor on the universality of realization spaces of oriented +matroids. Over the years the number of problems for which $\exists \mathbb{R}$ +rather than NP has turned out to be the proper way of measuring their +complexity has grown, particularly in the fields of computational geometry, +graph drawing, game theory, and some areas in logic and algebra. $\exists +\mathbb{R}$ has also started appearing in the context of machine learning, +Markov decision processes, and probabilistic reasoning. + We have aimed at collecting a comprehensive compendium of problems complete +and hard for $\exists \mathbb{R}$, as well as a long list of open problems. The +compendium is presented in the third part of our survey; a tour through the +compendium and the areas it touches on makes up the second part. The first part +introduces the reader to the existential theory of the reals as a complexity +class, discussing its history, motivation and prospects as well as some +technical aspects. + +
+
+ comment: 126 pages, 12 figures, 6 tables, about 150 complete problems and + about 50 open problems +
+
+
+
+
+ + ☆ Supercritical Size-Width Tree-Like Resolution Trade-Offs for Graph + Isomorphism + + +
+ We study the refutation complexity of graph isomorphism in the tree-like
+resolution calculus. Tor\'an and W\"orz (TOCL 2023) showed that there is a
+resolution refutation of narrow width $k$ for two graphs if and only if they
+can be distinguished in ($k+1$)-variable first-order logic (FO$^{k+1}$) and
+hence by a count-free variant of the $k$-dimensional Weisfeiler-Leman
+algorithm. While DAG-like narrow width $k$ resolution refutations have size at
+most $n^k$, tree-like refutations may be much larger. We show that there are
+graphs of order $n$ whose isomorphism can be refuted in narrow width $k$ but
+only in tree-like size $2^{\Omega(n^{k/2})}$. This is a supercritical trade-off
+where bounding one parameter (the narrow width) causes the other parameter (the
+size) to grow above its worst case. The size lower bound is super-exponential
+in the formula size and improves a related supercritical width versus tree-like
+size trade-off by Razborov (JACM 2016). To prove our result, we develop a new
+variant of the $k$-pebble EF-game for FO$^k$ to reason about tree-like
+refutation size in a similar way as the Prover-Delayer games in proof
+complexity. We analyze this game on a modified variant of the compressed CFI
+graphs introduced by Grohe, Lichter, Neuen, and Schweitzer (FOCS 2023). Using a
+recent improved robust compressed CFI construction of Janett, Nordstr\"om, and
+Pang (unpublished manuscript), we obtain a similar bound for width $k$ (instead
+of the stronger but less common narrow width) and make the result more robust.
+
+
+ comment: 32 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ On SAT information content, its polynomial-time solvability and fixed + code algorithms + + +
+ The amount of information in the satisfiability problem (SAT) is considered.
+SAT can be polynomial-time solvable when the solving algorithm holds an
+exponential amount of information. It is also established that the Kolmogorov
+complexity of SAT is constant. It is argued that the amount of information in
+SAT grows at least exponentially with the size of the input instance. The
+amount of information in SAT is compared with the amount of information in
+fixed code algorithms and with that generated over runtime.
+
+
+ comment: 16 pages, 1 table, 0 figures, new content, rewriting arguments, + corrected typos +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 7 + +
+
+
+ + ☆ Detecting and explaining (in)equivalence of context-free grammars + + +
+ We propose a scalable framework for deciding, proving, and explaining
+(in)equivalence of context-free grammars. We present an implementation of the
+framework and evaluate it on large datasets collected within educational
+support systems. Even though the equivalence problem for context-free languages
+is undecidable in general, the framework is able to handle a large portion of
+these datasets. It introduces and combines techniques from several areas, such
+as an abstract grammar transformation language to identify equivalent grammars
+as well as sufficiently similar inequivalent grammars, theory-based comparison
+algorithms for a large class of context-free languages, and a
+graph-theory-inspired grammar canonization that makes it possible to
+efficiently identify isomorphic grammars.
+
+
+
+
+
+ + ☆ Reachability for Multi-Priced Timed Automata with Positive and Negative + Rates + + +
+ Multi-priced timed automata (MPTA) are timed automata with observer
+variables whose derivatives can change from one location to another. Observers
+are write-only variables, that is, they do not affect the control flow of the
+automaton; thus MPTA lie between timed and hybrid automata in expressiveness.
+Previous work considered observers with non-negative slope in every location.
+In this paper we treat observers that have both positive and negative rates.
+Our main result is an algorithm to decide a gap version of the reachability
+problem for this variant of MPTA. We translate the gap reachability problem
+into a gap satisfiability problem for mixed integer-real systems of nonlinear
+constraints. Our main technical contribution -- a result of independent
+interest -- is a procedure to solve such constraints via a combination of
+branch-and-bound and relaxation-and-rounding.
+
+
+
+
+
+ + ☆ On the Minimisation of Deterministic and History-Deterministic + Generalised (co)Büchi Automata + + +
+ We present a polynomial-time algorithm minimising the number of states of +history-deterministic generalised coB\"uchi automata, building on the work of +Abu Radi and Kupferman on coB\"uchi automata. On the other hand, we establish +that the minimisation problem for both deterministic and history-deterministic +generalised B\"uchi automata is NP-complete, as well as the problem of +minimising at the same time the number of states and colours of +history-deterministic generalised coB\"uchi automata. + +
+
+
+
+
+ + ☆ The Existential Theory of the Reals as a Complexity Class: A Compendium + + +
+ We survey the complexity class $\exists \mathbb{R}$, which captures the +complexity of deciding the existential theory of the reals. The class $\exists +\mathbb{R}$ has roots in two different traditions, one based on the +Blum-Shub-Smale model of real computation, and the other following work by +Mn\"{e}v and Shor on the universality of realization spaces of oriented +matroids. Over the years the number of problems for which $\exists \mathbb{R}$ +rather than NP has turned out to be the proper way of measuring their +complexity has grown, particularly in the fields of computational geometry, +graph drawing, game theory, and some areas in logic and algebra. $\exists +\mathbb{R}$ has also started appearing in the context of machine learning, +Markov decision processes, and probabilistic reasoning. + We have aimed at collecting a comprehensive compendium of problems complete +and hard for $\exists \mathbb{R}$, as well as a long list of open problems. The +compendium is presented in the third part of our survey; a tour through the +compendium and the areas it touches on makes up the second part. The first part +introduces the reader to the existential theory of the reals as a complexity +class, discussing its history, motivation and prospects as well as some +technical aspects. + +
+
+ comment: 126 pages, 12 figures, 6 tables, about 150 complete problems and + about 50 open problems +
+
+
+
+
+ + ♻ ☆ History-deterministic Timed Automata + + +
+ We explore the notion of history-determinism in the context of timed automata
+(TA) over infinite timed words. History-deterministic (HD) automata are those
+in which nondeterminism can be resolved on the fly, based on the run
+constructed thus far. History-determinism is a robust property that admits
+different game-based characterisations, and HD specifications allow for
+game-based verification without an expensive determinization step.
+ We show that the class of timed $\omega$-languages recognised by HD timed
+automata strictly extends that of deterministic ones, and is strictly included
+in those recognised by fully non-deterministic TA.
+ For non-deterministic timed automata it is known that universality is already
+undecidable for safety/reachability TA. For history-deterministic TA with
+arbitrary parity acceptance, we show that timed universality, inclusion, and
+synthesis all remain decidable and are EXPTIME-complete.
+ For the subclass of TA with safety or reachability acceptance, one can decide
+(in EXPTIME) whether such an automaton is history-deterministic. If so, it can
+be effectively determinized without introducing new automata states.
+
+
+
+
+
+ + ♻ ☆ Model-bounded monitoring of hybrid systems + + +
+ Monitoring of hybrid systems attracts both scientific and practical +attention. However, monitoring algorithms suffer from the methodological +difficulty of only observing sampled discrete-time signals, while real +behaviors are continuous-time signals. To mitigate this problem of sampling +uncertainties, we introduce a model-bounded monitoring scheme, where we use +prior knowledge about the target system to prune interpolation candidates. +Technically, we express such prior knowledge by linear hybrid automata (LHAs) +-- the LHAs are called bounding models. We introduce a novel notion of +monitored language of LHAs, and we reduce the monitoring problem to the +membership problem of the monitored language. We present two partial algorithms +-- one is via reduction to reachability in LHAs and the other is a direct one +using polyhedra -- and show that these methods, and thus the proposed +model-bounded monitoring scheme, are efficient and practically relevant. + +
+
+ comment: This is the author version of the manuscript of the same name + published in the ACM Transactions on Cyber-Physical Systems +
+
+
+
+
+ 
 ♻ ☆ Parametric Timed Pattern Matching
 
 
+ Given a log and a specification, timed pattern matching aims at exhibiting +for which start and end dates a specification holds on that log. For example, +"a given action is always followed by another action before a given deadline". +This problem has strong connections with monitoring real-time systems. We +address here timed pattern matching in the presence of an uncertain +specification, i.e., that may contain timing parameters (e.g., the deadline can +be uncertain or unknown). We want to know for which start and end dates, and +for what values of the timing parameters, a property holds. For instance, we +look for the minimum or maximum deadline (together with the corresponding start +and end dates) for which the property holds. We propose two frameworks for +parametric timed pattern matching. The first one is based on parametric timed +model checking. In contrast to most parametric timed problems, the solution is +effectively computable. The second one is a dedicated method; not only we +largely improve the efficiency compared to the first method, but we further +propose optimizations with skipping. Our experiment results suggest that our +algorithms, especially the second one, are efficient and practically relevant. + +
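+ A toy Python sketch of the parametric question for the deadline example
+above: given a timed log, compute the smallest deadline for which "every
+request is followed by an acknowledgement within d" holds. The paper handles
+full timed-automata patterns and start/end-date valuations; this only conveys
+the flavour of the problem.
+
+    def min_deadline(log, a, b):
+        """Smallest d such that every `a` in the timed log is followed by
+        a `b` within d time units; None if some `a` is never answered."""
+        worst = 0.0
+        for i, (t, act) in enumerate(log):
+            if act != a:
+                continue
+            answers = [u for u, act2 in log[i + 1:] if act2 == b]
+            if not answers:
+                return None
+            worst = max(worst, answers[0] - t)
+        return worst
+
+    log = [(0.0, "req"), (1.2, "ack"), (3.0, "req"), (3.4, "req"), (5.1, "ack")]
+    print(min_deadline(log, "req", "ack"))   # about 2.1 (the req at 3.0)
+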
+
+ comment: This is the author version of the manuscript of the same name + published in ACM Transactions on Software Engineering and Methodology (Volume + 32, Issue 1, 2023). This manuscript is an extension of [ICECCS 2018, NFM + 2019], with [ICECCS 2018] describing the first method of this manuscript + (based on parametric timed model checking) while [NFM 2019] describes the + second dedicated method. arXiv admin note: substantial text overlap with + arXiv:1812.08940 +
+
+
+
+
+
+
+
+ + Logic in Computer Science 9 + +
+
+
+ + ☆ C2P: Featuring Large Language Models with Causal Reasoning + + +
+ Causal reasoning is the primary bottleneck that Large Language Models (LLMs)
+must overcome to attain human-level intelligence. To address this, we introduce
+the Causal Chain of Prompting (C2P) as the first reasoning framework that
+equips current LLMs with causal reasoning capabilities. C2P operates
+autonomously, avoiding reliance on external tools or modules during both the
+causal learning and reasoning phases, and can be seamlessly implemented during
+the training or fine-tuning of LLMs. Experimental results across various
+benchmark datasets demonstrate a significant improvement in causal learning and
+subsequent reasoning accuracy of LLMs. We illustrate how C2P enhances LLMs'
+ability to causally reason in real-world scenarios, addressing complex problems
+in fields such as healthcare, medicine, economics, education, social sciences,
+environmental science, and marketing. With few-shot learning, GPT-4 Turbo using
+C2P with as few as six examples achieves a more than 33% increase in reasoning
+accuracy over state-of-the-art LLMs, which perform nearly randomly in similar
+circumstances. This demonstrates the transformative potential of integrating
+C2P into LLM training or fine-tuning processes, thereby empowering these models
+with advanced causal reasoning capabilities.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2306.05836 by other authors +
+
+
+
+
+ + ☆ The Existential Theory of the Reals as a Complexity Class: A Compendium + + +
+ We survey the complexity class $\exists \mathbb{R}$, which captures the +complexity of deciding the existential theory of the reals. The class $\exists +\mathbb{R}$ has roots in two different traditions, one based on the +Blum-Shub-Smale model of real computation, and the other following work by +Mn\"{e}v and Shor on the universality of realization spaces of oriented +matroids. Over the years the number of problems for which $\exists \mathbb{R}$ +rather than NP has turned out to be the proper way of measuring their +complexity has grown, particularly in the fields of computational geometry, +graph drawing, game theory, and some areas in logic and algebra. $\exists +\mathbb{R}$ has also started appearing in the context of machine learning, +Markov decision processes, and probabilistic reasoning. + We have aimed at collecting a comprehensive compendium of problems complete +and hard for $\exists \mathbb{R}$, as well as a long list of open problems. The +compendium is presented in the third part of our survey; a tour through the +compendium and the areas it touches on makes up the second part. The first part +introduces the reader to the existential theory of the reals as a complexity +class, discussing its history, motivation and prospects as well as some +technical aspects. + +
+
+ comment: 126 pages, 12 figures, 6 tables, about 150 complete problems and + about 50 open problems +
+
+
+
+
+ + ☆ Pruning Boolean d-DNNF Circuits Through Tseitin-Awareness + + +
+ Boolean circuits in d-DNNF form enable tractable probabilistic inference. +However, as a key insight of this work, we show that commonly used d-DNNF +compilation approaches introduce irrelevant subcircuits. We call these +subcircuits Tseitin artifacts, as they are introduced due to the Tseitin +transformation step -- a well-established procedure to transform any circuit +into the CNF format required by several d-DNNF knowledge compilers. We discuss +how to detect and remove both Tseitin variables and Tseitin artifacts, leading +to more succinct circuits. We empirically observe an average size reduction of +77.5% when removing both Tseitin variables and artifacts. The additional +pruning of Tseitin artifacts reduces the size by 22.2% on average. This +significantly improves downstream tasks that benefit from a more succinct +circuit, e.g., probabilistic inference tasks. + +
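+ To make the origin of Tseitin variables concrete, a small Python sketch of
+the Tseitin encoding of AND-gates, tracking which variables are auxiliary so
+a downstream tool could target them. This illustrates only the transformation
+step; the paper's contribution, detecting and pruning the resulting artifacts
+inside compiled d-DNNF circuits, is not reproduced here.
+
+    import itertools
+
+    fresh = itertools.count(100)          # auxiliary (Tseitin) variable ids
+    tseitin_vars = set()
+
+    def tseitin_and(a, b, clauses):
+        """CNF-encode t <-> (a AND b); returns the fresh Tseitin literal t."""
+        t = next(fresh)
+        tseitin_vars.add(t)
+        clauses += [[-t, a], [-t, b], [t, -a, -b]]
+        return t
+
+    clauses = []
+    t1 = tseitin_and(1, 2, clauses)       # t1 <-> (x1 AND x2)
+    t2 = tseitin_and(t1, 3, clauses)      # t2 <-> (t1 AND x3)
+    clauses.append([t2])                  # assert the root gate
+    print("CNF:", clauses)
+    print("Tseitin variables a compiler could forget:", sorted(tseitin_vars))
+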
+
+ comment: submitted to ICTAI 2024 +
+
+
+
+
+ + ☆ Supercritical Size-Width Tree-Like Resolution Trade-Offs for Graph + Isomorphism + + +
+ We study the refutation complexity of graph isomorphism in the tree-like
+resolution calculus. Tor\'an and W\"orz (TOCL 2023) showed that there is a
+resolution refutation of narrow width $k$ for two graphs if and only if they
+can be distinguished in ($k+1$)-variable first-order logic (FO$^{k+1}$) and
+hence by a count-free variant of the $k$-dimensional Weisfeiler-Leman
+algorithm. While DAG-like narrow width $k$ resolution refutations have size at
+most $n^k$, tree-like refutations may be much larger. We show that there are
+graphs of order $n$ whose isomorphism can be refuted in narrow width $k$ but
+only in tree-like size $2^{\Omega(n^{k/2})}$. This is a supercritical trade-off
+where bounding one parameter (the narrow width) causes the other parameter (the
+size) to grow above its worst case. The size lower bound is super-exponential
+in the formula size and improves a related supercritical width versus tree-like
+size trade-off by Razborov (JACM 2016). To prove our result, we develop a new
+variant of the $k$-pebble EF-game for FO$^k$ to reason about tree-like
+refutation size in a similar way as the Prover-Delayer games in proof
+complexity. We analyze this game on a modified variant of the compressed CFI
+graphs introduced by Grohe, Lichter, Neuen, and Schweitzer (FOCS 2023). Using a
+recent improved robust compressed CFI construction of Janett, Nordstr\"om, and
+Pang (unpublished manuscript), we obtain a similar bound for width $k$ (instead
+of the stronger but less common narrow width) and make the result more robust.
+
+
+ comment: 32 pages, 2 figures +
+
+
+
+
+ + ☆ On Polynomial-Time Decidability of k-Negations Fragments of First-Order + Theories + + +
+ This paper introduces a generic framework that provides sufficient conditions
+for guaranteeing polynomial-time decidability of fixed-negation fragments of
+first-order theories that adhere to certain fixed-parameter tractability
+requirements. It enables deciding sentences of such theories with arbitrary
+existential quantification, conjunction, and a fixed number of negation symbols
+in polynomial time. It was recently shown by Nguyen and Pak [SIAM J. Comput.
+51(2): 1--31 (2022)] that an even more restricted fragment of Presburger
+arithmetic (the first-order theory of the integers with addition and order) is
+NP-hard. In contrast, by application of our framework, we show that the
+fixed-negation fragment of weak Presburger arithmetic, which drops the order
+relation from Presburger arithmetic in favour of equality, is decidable in
+polynomial time.
+
+
+
+
+
+ + ♻ ☆ History-deterministic Timed Automata + + +
+ We explore the notion of history-determinism in the context of timed automata
+(TA) over infinite timed words. History-deterministic (HD) automata are those
+in which nondeterminism can be resolved on the fly, based on the run
+constructed thus far. History-determinism is a robust property that admits
+different game-based characterisations, and HD specifications allow for
+game-based verification without an expensive determinization step.
+ We show that the class of timed $\omega$-languages recognised by HD timed
+automata strictly extends that of deterministic ones, and is strictly included
+in those recognised by fully non-deterministic TA.
+ For non-deterministic timed automata it is known that universality is already
+undecidable for safety/reachability TA. For history-deterministic TA with
+arbitrary parity acceptance, we show that timed universality, inclusion, and
+synthesis all remain decidable and are EXPTIME-complete.
+ For the subclass of TA with safety or reachability acceptance, one can decide
+(in EXPTIME) whether such an automaton is history-deterministic. If so, it can
+be effectively determinized without introducing new automata states.
+
+
+
+
+
+ + ♻ ☆ One is all you need: Second-order Unification without First-order + Variables + + +
+ We introduce a fragment of second-order unification, referred to as
+\emph{Second-Order Ground Unification (SOGU)}, with the following properties:
+(i) only one second-order variable is allowed, and (ii) first-order variables
+do not occur. We study an equational variant of SOGU where the signature
+contains \textit{associative} binary function symbols (ASOGU) and show that
+Hilbert's 10$^{th}$ problem is reducible to ASOGU unifiability, thus proving
+undecidability. Our reduction provides a new lower bound for the undecidability
+of second-order unification, as previous results required first-order variable
+occurrences, multiple second-order variables, and/or equational theories
+involving \textit{length-reducing} rewrite systems. Furthermore, our reduction
+holds even in the case when associativity of the binary function symbol is
+restricted to \emph{power associativity}, i.e., $f(f(x,x),x) = f(x,f(x,x))$, as
+our construction requires a single constant.
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Model-bounded monitoring of hybrid systems + + +
+ Monitoring of hybrid systems attracts both scientific and practical +attention. However, monitoring algorithms suffer from the methodological +difficulty of only observing sampled discrete-time signals, while real +behaviors are continuous-time signals. To mitigate this problem of sampling +uncertainties, we introduce a model-bounded monitoring scheme, where we use +prior knowledge about the target system to prune interpolation candidates. +Technically, we express such prior knowledge by linear hybrid automata (LHAs) +-- the LHAs are called bounding models. We introduce a novel notion of +monitored language of LHAs, and we reduce the monitoring problem to the +membership problem of the monitored language. We present two partial algorithms +-- one is via reduction to reachability in LHAs and the other is a direct one +using polyhedra -- and show that these methods, and thus the proposed +model-bounded monitoring scheme, are efficient and practically relevant. + +
+
+ comment: This is the author version of the manuscript of the same name + published in the ACM Transactions on Cyber-Physical Systems +
+
+
+
+
+ + ♻ ☆ Zeta Functions and the (Linear) Logic of Markov Processes + + +
+ The author introduced models of linear logic known as ''Interaction Graphs''
+which generalise Girard's various geometry of interaction constructions. In
+this work, we establish how these models essentially rely on a deep connection
+between zeta functions and the execution of programs, expressed as a cocycle.
+This is first shown in the simple case of graphs, before being lifted to
+dynamical systems. Focussing on probabilistic models, we then explain how the
+notion of graphings used in Interaction Graphs captures a natural class of
+sub-Markov processes. We then extend the realisability constructions and the
+notion of zeta function to provide a realisability model of second-order linear
+logic over the set of all (discrete-time) sub-Markov processes.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Logic in Computer Science 8 + +
+
+
+ + ☆ Static and Dynamic Verification of OCaml Programs: The Gospel Ecosystem + (Extended Version) + + +
+ We present our work on the collaborative use of dynamic and static analysis
+tools for the verification of software written in the OCaml language. We build
+upon Gospel, a specification language for OCaml that can be used both in
+dynamic and static analyses. We employ Ortac, for runtime assertion checking,
+and Cameleer and CFML for the deductive verification of OCaml code. We report
+on the use of such tools to build a case study of collaborative analysis of a
+non-trivial OCaml program. This shows how these tools nicely complement each
+other, while at the same time highlighting the differences when writing
+specifications targeting dynamic or static analysis methods.
+
+
+
+
+
+ + ☆ Formalizing UML State Machines for Automated Verification -- A Survey + + +
+ The Unified Modeling Language (UML) is a standard for modeling dynamic +systems. UML behavioral state machines are used for modeling the dynamic +behavior of object-oriented designs. The UML specification, maintained by the +Object Management Group (OMG), is documented in natural language (in contrast +to formal language). The inherent ambiguity of natural languages may introduce +inconsistencies in the resulting state machine model. Formalizing UML state +machine specification aims at solving the ambiguity problem and at providing a +uniform view to software designers and developers. Such a formalization also +aims at providing a foundation for automatic verification of UML state machine +models, which can help to find software design vulnerabilities at an early +stage and reduce the development cost. We provide here a comprehensive survey +of existing work from 1997 to 2021 related to formalizing UML state machine +semantics for the purpose of conducting model checking at the design stage. + +
+
+ comment: This is the author version of the manuscript of the same name + published in ACM Computing Surveys +
+
+
+
+
+ + ☆ A quantitative probabilistic relational Hoare logic + + +
+ We introduce eRHL, a program logic for reasoning about relational expectation +properties of pairs of probabilistic programs. eRHL is quantitative, i.e., its +pre- and post-conditions take values in the extended non-negative reals. Thanks +to its quantitative assertions, eRHL overcomes randomness alignment +restrictions from prior logics, including PRHL, a popular relational program +logic used to reason about security of cryptographic constructions, and apRHL, +a variant of PRHL for differential privacy. As a result, eRHL is the first +relational probabilistic program logic to be supported by non-trivial soundness +and completeness results for all almost surely terminating programs. We show +that eRHL is sound and complete with respect to program equivalence, +statistical distance, and differential privacy. We also show that every PRHL +judgment is valid iff it is provable in eRHL. We showcase the practical +benefits of eRHL with examples that are beyond reach of PRHL and apRHL. + +
+
+
+
+
+ + ☆ A process algebraic framework for multi-agent dynamic epistemic systems + + +
+ This paper combines the classical model of labeled transition systems with +the epistemic model for reasoning about knowledge. The result is a unifying +framework for modeling and analyzing multi-agent, knowledge-based, dynamic +systems. On the modeling side, we propose a process algebraic, agent-oriented +specification language that makes such a framework easy to use for practical +purposes. On the verification side, we define a modal logic encompassing +temporal and epistemic operators. + +
+
+
+
+
+ + ♻ ☆ Goedel logics: Prenex fragments + + +
+ In this paper, we provide a complete classification for the first-order
+Goedel logics concerning the property that the formulas admit logically
+equivalent prenex normal forms. We show that the only first-order Goedel logics
+that admit such prenex forms are those with finite truth value sets, since they
+allow all quantifier-shift rules, and the logic $G_\uparrow$ with only one
+accumulation point at 1 in the infinite truth value set. In all the other
+cases, there are generally no logically equivalent prenex normal forms. We will
+also see that $G_\uparrow$ is the intersection of all finite first-order Goedel
+logics.
+ The second part of this paper investigates the existence of effective
+equivalence between the validity of a formula and the validity of some prenex
+normal form. The existence of such a normal form is obvious for finite-valued
+Goedel logics and $G_\uparrow$. Goedel logics with an uncountable truth value
+set admit the prenex normal forms if and only if every neighbourhood of 0 is
+uncountable or 0 is an isolated point. Otherwise, uncountable Goedel logics are
+not recursively enumerable; however, the prenex fragment is always recursively
+enumerable. Therefore, there is no effective translation between the valid
+formulas and the valid prenex normal forms. However, the existence of
+effectively constructible validity-equivalent prenex forms for the countable
+case is still up for debate.
+
+
+ comment: Research supported by FWF grant P 36571 +
+
+
+
+
+ + ♻ ☆ Representing Sugihara monoids via weakening relations + + +
+ We show that all Sugihara monoids can be represented as algebras of binary +relations, with the monoid operation given by relational composition. Moreover, +the binary relations are weakening relations. The first step is to obtain an +explicit relational representation of all finite odd Sugihara chains. Our +construction mimics that of Maddux (2010), where a relational representation of +the finite even Sugihara chains is given. We define the class of representable +Sugihara monoids as those which can be represented as reducts of distributive +involutive FL-algebras of binary relations. We then show that the class of +representable distributive involutive FL-algebras is closed under +ultraproducts. This fact is used to demonstrate that the two infinite Sugihara +monoids that generate the quasivariety are also representable. From this it +follows that all Sugihara monoids are representable. + +
+
+ comment: 29 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Worst-Case Input Generation for Concurrent Programs under Non-Monotone + Resource Metrics + + +
+ Worst-case input generation aims to automatically generate inputs that +exhibit the worst-case performance of programs. It has several applications, +and can, for example, detect vulnerabilities to denial-of-service attacks. +However, it is non-trivial to generate worst-case inputs for concurrent +programs, particularly for resources like memory where the peak cost depends on +how processes are scheduled. + This article presents the first sound worst-case input generation algorithm +for concurrent programs under non-monotone resource metrics like memory. The +key insight is to leverage resource-annotated session types and symbolic +execution. Session types describe communication protocols on channels in +process calculi. Equipped with resource annotations, resource-annotated session +types not only encode cost bounds but also indicate how many resources can be +reused and transferred between processes. This information is critical for +identifying a worst-case execution path during symbolic execution. The +algorithm is sound: if it returns any input, it is guaranteed to be a valid +worst-case input. The algorithm is also relatively complete: as long as +resource-annotated session types are sufficiently expressive and the background +theory for SMT solving is decidable, a worst-case input is guaranteed to be +returned. A simple case study of a web server's memory usage demonstrates the +utility of the worst-case input generation algorithm. + +
+
+
+
+
+ + ♻ ☆ Higher Order Automatic Differentiation of Higher Order Functions + + +
+ We present semantic correctness proofs of automatic differentiation (AD). We +consider a forward-mode AD method on a higher order language with algebraic +data types, and we characterise it as the unique structure preserving macro +given a choice of derivatives for basic operations. We describe a rich +semantics for differentiable programming, based on diffeological spaces. We +show that it interprets our language, and we phrase what it means for the AD +method to be correct with respect to this semantics. We show that our +characterisation of AD gives rise to an elegant semantic proof of its +correctness based on a gluing construction on diffeological spaces. We explain +how this is, in essence, a logical relations argument. Throughout, we show how +the analysis extends to AD methods for computing higher order derivatives using +a Taylor approximation. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2001.02209 +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 13 + +
+
+
+ + ☆ Tutorial: Object as a Service (OaaS) Serverless Cloud Computing Paradigm + + +
+ While the first generation of cloud computing systems eased the job of
+system administrators, the next generation of cloud computing systems is
+emerging to ease the burden on cloud developers -- facilitating the
+development of cloud-native applications. This paradigm shift is primarily
+happening by offering higher-level serverless abstractions, such as Function as
+a Service (FaaS). Although FaaS has successfully abstracted developers from the
+cloud resource management details, it falls short in abstracting the management
+of both data (i.e., state) and the non-functional aspects, such as Quality of
+Service (QoS) requirements. The lack of such abstractions implies developer
+intervention and is counterproductive to the objective of easing the burden of
+cloud-native application development. To further streamline cloud-native
+application development, we present Object-as-a-Service (OaaS) -- a serverless
+paradigm that borrows object-oriented programming concepts to encapsulate
+application logic and data, in addition to non-functional requirements, into a
+single deployment package, thereby streamlining provider-agnostic cloud-native
+application development. We realized the OaaS paradigm through the development
+of an open-source platform called Oparaca. In this tutorial, we will present
+the concept and design of the OaaS paradigm and its implementation -- the
+Oparaca platform. Then, we give a tutorial on developing and deploying an
+application on the Oparaca platform and discuss its benefits and its optimal
+configurations to avoid potential overheads.
+
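+ A toy Python sketch of the OaaS idea of one deployable unit bundling state,
+logic, and non-functional requirements. The class and field names are invented
+for illustration and are not Oparaca's actual API.
+
+    from dataclasses import dataclass
+
+    @dataclass
+    class QoS:
+        max_latency_ms: int = 100
+        availability: float = 0.999
+
+    class OaaSObject:
+        """One deployable unit: state, logic and QoS travel together."""
+        qos = QoS()
+        def __init__(self):
+            self.state = {}    # persisted by the platform, not the developer
+
+    class Counter(OaaSObject):
+        qos = QoS(max_latency_ms=50)     # non-functional requirement
+        def increment(self):
+            self.state["n"] = self.state.get("n", 0) + 1
+            return self.state["n"]
+
+    c = Counter()
+    print(c.increment(), c.increment(), c.qos.max_latency_ms)   # 1 2 50
+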
+
+
+
+
+ + ☆ Noise-Aware Distributed Quantum Approximate Optimization Algorithm on + Near-term Quantum Hardware + + +
+ This paper introduces a noise-aware distributed Quantum Approximate +Optimization Algorithm (QAOA) tailored for execution on near-term quantum +hardware. Leveraging a distributed framework, we address the limitations of +current Noisy Intermediate-Scale Quantum (NISQ) devices, which are hindered by +limited qubit counts and high error rates. Our approach decomposes large QAOA +problems into smaller subproblems, distributing them across multiple Quantum +Processing Units (QPUs) to enhance scalability and performance. The noise-aware +strategy incorporates error mitigation techniques to optimize qubit fidelity +and gate operations, ensuring reliable quantum computations. We evaluate the +efficacy of our framework using the HamilToniQ Benchmarking Toolkit, which +quantifies the performance across various quantum hardware configurations. The +results demonstrate that our distributed QAOA framework achieves significant +improvements in computational speed and accuracy, showcasing its potential to +solve complex optimization problems efficiently in the NISQ era. This work sets +the stage for advanced algorithmic strategies and practical quantum system +enhancements, contributing to the broader goal of achieving quantum advantage. + +
+
+
+
+
+ + ☆ Lossy Data Compression By Adaptive Mesh Coarsening + + +
+ Today's scientific simulations, for example in the high-performance exascale
+sector, produce huge amounts of data. Due to limited I/O bandwidth and
+available storage space, there is a need to reduce the scientific data produced
+by high performance computing applications. Error-bounded lossy compression has
+been proven to be an effective approach tackling the trade-off between accuracy
+and storage space. Within this work, we explore and discuss error-bounded lossy
+compression based solely on adaptive mesh refinement techniques. This
+compression technique is not only easily integrated into existing adaptive mesh
+refinement applications but also serves as a general lossy compression approach
+for arbitrary data in the form of multi-dimensional arrays, irrespective of the
+data type. Moreover, these techniques permit the exclusion of regions of
+interest and even allow for nested error domains during the compression. The
+described data compression technique is demonstrated on ERA5 data as an
+example.
+
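+ A 1-D Python sketch of error-bounded coarsening in the spirit of the
+approach above: a block of values is replaced by its mean whenever the
+pointwise error stays within the bound. Real AMR-based compressors work on
+2-D/3-D meshes with refinement trees; this conveys only the core idea.
+
+    import numpy as np
+
+    def coarsen(block, tol):
+        """Replace `block` by one coarse cell (its mean) if the pointwise
+        error stays within `tol`; otherwise split and recurse."""
+        mean = block.mean()
+        if np.max(np.abs(block - mean)) <= tol or len(block) == 1:
+            return [(len(block), float(mean))]
+        mid = len(block) // 2
+        return coarsen(block[:mid], tol) + coarsen(block[mid:], tol)
+
+    def decompress(cells):
+        return np.concatenate([np.full(n, v) for n, v in cells])
+
+    data = np.sin(np.linspace(0, 4 * np.pi, 1024)) + 0.01 * np.random.randn(1024)
+    cells = coarsen(data, tol=0.05)
+    rec = decompress(cells)
+    print(f"{data.size} values -> {len(cells)} cells, "
+          f"max error {np.max(np.abs(rec - data)):.3f}")
+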
+
+
+
+
+ + ☆ Edge-Cloud Continuum Orchestration of Critical Services: A Smart-City + Approach + + +
+ Smart-city services are typically developed as closed systems within each
+city's vertical, communicating and interacting with cloud services while
+remaining isolated within each provider's domain. With the emergence of 5G
+private domains and the introduction of new M2M services focusing on autonomous
+systems, there is a shift from the cloud-based approach to a distributed edge
+computing paradigm, in a continuum orchestration. However, an essential
+component is missing. Current orchestration tools, designed for cloud-based
+deployments, lack robust workload isolation, fail to meet timing constraints,
+and are not tailored to the resource-constrained nature of edge devices.
+Therefore, new orchestration methods are needed to support MEC environments.
+The work presented in this paper addresses this gap. Based on the real needs of
+a smart-city testbed -- the Aveiro Living Lab -- we developed a set of
+orchestration components to facilitate the seamless orchestration of both cloud
+and edge-based services, encompassing both critical and non-critical services.
+This work extends the current Kubernetes orchestration platform to include a
+novel location-specific resource definition, a custom scheduler to accommodate
+real-time and legacy services, continuous service monitoring to detect
+sub-optimal states, and a refined load balancing mechanism that prioritizes the
+fastest response times.
+
+
+
+
+
+ + ☆ Software Defined Vehicles for Development of Deterministic Services + + +
+ As modern vehicles evolve with more features, services, and complex systems
+-- more sensors, actuators, and processing units -- it is essential to think
+about vehicles not only as means of transportation that may tend towards full
+autonomy, but also as adaptive objects that suit themselves to the needs of
+occupants. Vehicular services can be developed to support these adaptations.
+However, even with current standardizations, best practices, and guidelines,
+vehicular service development remains highly complex, with expectations of up
+to 1 (U.S.) billion lines of code for a fully (level 5) autonomous vehicle.
+Within this survey, the paradigm of Deterministic Software Defined Vehicles is
+explored towards increasing the quality and easing the development of services
+for automotive. Towards this, a vision with four pillars is also provided: the
+deterministic network configurator, the data layer configurator, the hypervisor
+configurator, and the vehicle abstraction layer, all coordinated by a software
+orchestrator.
+
+
+
+
+
+ + ☆ Bridging Trust into the Blockchain: A Systematic Review on On-Chain + Identity + + +
+ The ongoing regulation of blockchain-based services and applications requires +the identification of users who are issuing transactions on the blockchain. +This systematic review explores the current status, identifies research gaps, +and outlines future research directions for establishing trusted and +privacy-compliant identities on the blockchain (on-chain identity). A +systematic search term was applied across various scientific databases, +collecting 2232 potentially relevant research papers. These papers were +narrowed down in two methodologically executed steps to 98 and finally to 13 +relevant sources. The relevant articles were then systematically analyzed based +on a set of screening questions. The results of the selected studies have +provided insightful findings on the mechanisms of on-chain identities. On-chain +identities are established using zero-knowledge proofs, public key +infrastructure/certificates, and web of trust approaches. The technologies and +architectures used by the authors are also highlighted. Trust has emerged as a +key research gap, manifesting in two ways: firstly, a gap in how to trust the +digital identity representation of a physical human; secondly, a gap in how to +trust identity providers that issue identity confirmations on-chain. Potential +future research avenues are suggested to help fill the current gaps in +establishing trust and on-chain identities. + +
+
+
+
+
+ + ☆ PARS3: Parallel Sparse Skew-Symmetric Matrix-Vector Multiplication with + Reverse Cuthill-McKee Reordering + + +
+ Sparse matrices, a prevalent primitive of various scientific computing
+algorithms, persist as a processing bottleneck. A skew-symmetric matrix flips
+the signs of the symmetric entry pairs of a symmetric matrix. Our work,
+Parallel 3-Way Banded Skew-Symmetric Sparse Matrix-Vector Multiplication,
+improves parallel symmetric SpMV kernels as well, taking a different
+perspective than common trends in the literature, by manipulating the form of
+the matrix in a preprocessing step to accelerate the repeated computations of
+iterative solvers. We effectively use the Reverse Cuthill-McKee (RCM)
+reordering algorithm to transform a sparse skew-symmetric matrix into a band
+matrix, then efficiently parallelize it by splitting the band structure into 3
+different parts according to its local sparsity. Our proposed method with RCM
+is novel in the sense that it is the first implementation of parallel
+skew-symmetric SpMV kernels. Our enhancements in SpMV yield significant strong
+scaling of up to 19x over the serial compressed SpMV implementation. We
+outperform a heuristic-based graph-coloring approach with synchronization
+phases in implementing parallel symmetric SpMVs. Our approach also naturally
+applies to parallel sparse symmetric SpMVs, which can inspire widespread SpMV
+solutions to adopt the optimizations presented in this paper.
+
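+ A Python sketch of the preprocessing idea using SciPy's RCM implementation:
+reorder a sparse skew-symmetric matrix into band form and exploit
+skew-symmetry so only the strict upper triangle is stored. The paper's 3-way
+band split and parallel kernel are not reproduced here.
+
+    import numpy as np
+    import scipy.sparse as sp
+    from scipy.sparse.csgraph import reverse_cuthill_mckee
+
+    # Toy sparse skew-symmetric matrix A = T - T^T (zero diagonal).
+    n = 1000
+    T = sp.triu(sp.random(n, n, density=0.002, random_state=0), k=1).tocsr()
+    A = (T - T.T).tocsr()
+
+    def bandwidth(M):
+        r, c = M.nonzero()
+        return int(np.abs(r - c).max())
+
+    # RCM reordering shrinks the bandwidth, yielding a band-like matrix.
+    perm = reverse_cuthill_mckee(A, symmetric_mode=True)
+    B = A[perm, :][:, perm].tocsr()
+    print("bandwidth:", bandwidth(A), "->", bandwidth(B))
+
+    # Skew-symmetry: the strict upper triangle suffices; each stored entry
+    # is applied twice with opposite signs in y = B x.
+    Bu = sp.triu(B, k=1).tocsr()
+    x = np.random.default_rng(0).standard_normal(n)
+    y = Bu @ x - Bu.T @ x
+    assert np.allclose(y, B @ x)
+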
+
+
+
+
+ + ☆ SFPrompt: Communication-Efficient Split Federated Fine-Tuning for Large + Pre-Trained Models over Resource-Limited Devices + + +
+ Large pre-trained models have exhibited remarkable achievements across +various domains. The substantial training costs associated with these models +have led to wide studies of fine-tuning for effectively harnessing their +capabilities in solving downstream tasks. Yet, conventional fine-tuning +approaches become infeasible when the model lacks access to downstream data due +to privacy concerns. Naively integrating fine-tuning approaches with the +emerging federated learning frameworks incurs substantial communication +overhead and exerts high demand on local computing resources, making it +impractical for common resource-limited devices. In this paper, we introduce +SFPrompt, an innovative privacy-preserving fine-tuning method tailored for the +federated setting where direct uploading of raw data is prohibited and local +devices are resource-constrained to run a complete pre-trained model. In +essence, SFPrompt judiciously combines split learning with federated learning +to handle these challenges. Specifically, the pre-trained model is first +partitioned into client and server components, thereby streamlining the +client-side model and substantially alleviating computational demands on local +resources. SFPrompt then introduces soft prompts into the federated model to +enhance the fine-tuning performance. To further reduce communication costs, a +novel dataset pruning algorithm and a local-loss update strategy are devised +during the fine-tuning process. Extensive experiments demonstrate that SFPrompt +delivers competitive performance as the federated full fine-tuning approach +while consuming a mere 0.46% of local computing resources and incurring 53% +less communication cost. + +
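+ A minimal PyTorch sketch of the split-plus-soft-prompt idea: the backbone is
+frozen and partitioned into a client part and a server part, and only a small
+prompt tensor is trained. Sizes and module names are illustrative assumptions,
+not SFPrompt's architecture.
+
+    import torch
+    import torch.nn as nn
+
+    # Frozen backbone split into a light client part and a heavy server part.
+    client_net = nn.Embedding(1000, 64)                 # runs on the device
+    server_net = nn.Sequential(nn.Linear(64, 64), nn.ReLU(),
+                               nn.Linear(64, 10))       # runs on the server
+    for p in list(client_net.parameters()) + list(server_net.parameters()):
+        p.requires_grad = False
+
+    # Only this small soft prompt is trained.
+    soft_prompt = nn.Parameter(0.02 * torch.randn(4, 64))
+    opt = torch.optim.Adam([soft_prompt], lr=1e-3)
+
+    tokens = torch.randint(0, 1000, (8, 16))        # raw local data stays put
+    labels = torch.randint(0, 10, (8,))
+
+    emb = client_net(tokens)                        # client-side forward
+    emb = torch.cat([soft_prompt.expand(8, -1, -1), emb], dim=1)
+    logits = server_net(emb).mean(dim=1)            # server-side forward
+    loss = nn.functional.cross_entropy(logits, labels)
+    loss.backward()                                 # only the prompt gets grads
+    opt.step()
+    print(float(loss))
+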
+
+
+
+
+ + ♻ ☆ Proof-of-Collaborative-Learning: A Multi-winner Federated Learning + Consensus Algorithm + + +
+ Regardless of their variations, blockchains require a consensus mechanism to +validate transactions, supervise added blocks, maintain network security, +synchronize the network state, and distribute incentives. Proof-of-Work (PoW), +one of the most influential implementations of consensus mechanisms, consumes +an extraordinary amount of energy for a task that lacks direct productive +output. In this paper, we propose Proof-of-Collaborative-Learning (PoCL), a +multi-winner federated learning validated consensus mechanism that redirects +the computation power of blockchains to train federated learning models. In +addition, we present a novel evaluation mechanism to ensure the efficiency of +the locally trained models of miners. We evaluated the security of our +evaluation mechanism by introducing and conducting probable attacks. Moreover, +we present a novel reward distribution mechanism to incentivize winning miners +fairly, and demonstrate that our reward system is fair both within and across +all rounds. + +
+
+ comment: 8 pages. Accepted at the 7th IEEE International Conference on + Blockchain (Blockchain 2024) +
+
+
+
+
+ + ♻ ☆ Causal Discovery over High-Dimensional Structured Hypothesis Spaces with + Causal Graph Partitioning + + +
+ The aim in many sciences is to understand the mechanisms that underlie the +observed distribution of variables, starting from a set of initial hypotheses. +Causal discovery allows us to infer mechanisms as sets of cause and effect +relationships in a generalized way -- without necessarily tailoring to a +specific domain. Causal discovery algorithms search over a structured +hypothesis space, defined by the set of directed acyclic graphs, to find the +graph that best explains the data. For high-dimensional problems, however, this +search becomes intractable and scalable algorithms for causal discovery are +needed to bridge the gap. In this paper, we define a novel causal graph +partition that allows for divide-and-conquer causal discovery with theoretical +guarantees. We leverage the idea of a superstructure -- a set of learned or +existing candidate hypotheses -- to partition the search space. We prove under +certain assumptions that learning with a causal graph partition always yields +the Markov Equivalence Class of the true causal graph. We show our algorithm +achieves comparable accuracy and a faster time to solution for +biologically-tuned synthetic networks and networks up to ${10^4}$ variables. +This makes our method applicable to gene regulatory network inference and other +domains with high-dimensional structured hypothesis spaces. + +
+
+
+
+
+ + ♻ ☆ Scalable mRMR feature selection to handle high dimensional datasets: + Vertical partitioning based Iterative MapReduce framework + + +
+ While building machine learning models, feature selection (FS) stands out as
+an essential preprocessing step used to handle the uncertainty and vagueness in
+the data. Recently, the minimum Redundancy and Maximum Relevance (mRMR)
+approach has proven to be effective in obtaining an irredundant feature subset.
+Owing to the generation of voluminous datasets, it is essential to design
+scalable solutions using distributed/parallel paradigms. MapReduce solutions
+are proven to be one of the best approaches to designing fault-tolerant and
+scalable solutions. This work analyses the existing MapReduce approaches for
+mRMR feature selection and identifies the limitations thereof. In the current
+study, we propose VMR_mRMR, an efficient vertical partitioning-based approach
+using a memorization approach, thereby overcoming the limitations of extant
+approaches. The experimental analysis shows that VMR_mRMR significantly
+outperformed extant approaches and achieved a better computational gain (C.G).
+In addition, we also conducted a comparative analysis with the horizontal
+partitioning approach HMR_mRMR [1] to assess the strengths and limitations of
+the proposed approach.
+
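+ A Python sketch of greedy mRMR with memoized pairwise scores, the quantity a
+vertical (feature-wise) partitioning can distribute. Absolute Pearson
+correlation stands in for mutual information here, which is an assumption for
+brevity, not the paper's measure.
+
+    import numpy as np
+
+    def mrmr(X, y, k):
+        """Greedy mRMR: maximize relevance minus mean redundancy, memoizing
+        pairwise feature scores so each is computed once."""
+        n_feat = X.shape[1]
+        rel = np.array([abs(np.corrcoef(X[:, j], y)[0, 1])
+                        for j in range(n_feat)])
+        cache = {}                        # memoized feature-feature scores
+        selected = [int(np.argmax(rel))]
+        while len(selected) < k:
+            best, best_score = None, -np.inf
+            for j in range(n_feat):
+                if j in selected:
+                    continue
+                for s in selected:
+                    if (j, s) not in cache:
+                        cache[(j, s)] = abs(np.corrcoef(X[:, j], X[:, s])[0, 1])
+                score = rel[j] - np.mean([cache[(j, s)] for s in selected])
+                if score > best_score:
+                    best, best_score = j, score
+            selected.append(best)
+        return selected
+
+    rng = np.random.default_rng(0)
+    X = rng.normal(size=(200, 10))
+    y = X[:, 2] + 0.5 * X[:, 7] + 0.1 * rng.normal(size=200)
+    print(mrmr(X, y, k=3))      # features 2 and 7 typically rank early
+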
+
+ comment: 20 pages, 3 Figures, 5 Tables +
+
+
+
+
+ + ♻ ☆ Computational Power of Mobile Robots in Synchronous Environment: + Discrete Version + + +
+ In distributed computing by mobile robots, robots are deployed over a region,
+continuous or discrete, operating through a sequence of
+\textit{look-compute-move} cycles. An extensive study has been carried out to
+understand the computational power of different robot models. The models vary
+in their ability to 1)~remember constant-size information and 2)~communicate
+constant-size messages. Depending on these abilities, the models are
+1)~$\mathcal{OBLOT}$ (robots are oblivious and silent), 2)~$\mathcal{FSTA}$
+(robots have finite states but are silent), 3)~$\mathcal{FCOM}$ (robots are
+oblivious but can communicate constant-size information) and
+4)~$\mathcal{LUMI}$ (robots have finite states and can communicate constant-
+size information). Another factor that affects computational ability is the
+scheduler that decides the activation times of the robots. The three main
+schedulers are \textit{fully-synchronous}, \textit{semi-synchronous} and
+\textit{asynchronous}. Combining the models ($M$) with the schedulers ($K$), we
+have twelve combinations $M^K$.
+ In the Euclidean domain, the comparisons between these twelve variants have
+been carried out in different works for transparent robots, opaque robots, and
+robots with limited visibility. Comparable results are missing when robots
+operate on discrete regions like networks. The discrete setting demands
+separate research attention because there has been a series of works where
+robots operate on different networks, and robot movement differs fundamentally
+between continuous and discrete domains. This work contributes to filling this
+gap by giving a full comparison table for all models under two synchronous
+schedulers: fully-synchronous and semi-synchronous.
+
+
+
+
+
+ + ♻ ☆ Worst-Case Input Generation for Concurrent Programs under Non-Monotone + Resource Metrics + + +
+ Worst-case input generation aims to automatically generate inputs that +exhibit the worst-case performance of programs. It has several applications, +and can, for example, detect vulnerabilities to denial-of-service attacks. +However, it is non-trivial to generate worst-case inputs for concurrent +programs, particularly for resources like memory where the peak cost depends on +how processes are scheduled. + This article presents the first sound worst-case input generation algorithm +for concurrent programs under non-monotone resource metrics like memory. The +key insight is to leverage resource-annotated session types and symbolic +execution. Session types describe communication protocols on channels in +process calculi. Equipped with resource annotations, resource-annotated session +types not only encode cost bounds but also indicate how many resources can be +reused and transferred between processes. This information is critical for +identifying a worst-case execution path during symbolic execution. The +algorithm is sound: if it returns any input, it is guaranteed to be a valid +worst-case input. The algorithm is also relatively complete: as long as +resource-annotated session types are sufficiently expressive and the background +theory for SMT solving is decidable, a worst-case input is guaranteed to be +returned. A simple case study of a web server's memory usage demonstrates the +utility of the worst-case input generation algorithm. + +
+
+
+
+
+
+
+
+
+ Hardware Architecture 3
+
+
+
+ + ☆ An FPGA-Based Open-Source Hardware-Software Framework for Side-Channel + Security Research + + +
+ Attacks based on side-channel analysis (SCA) pose a severe security threat to +modern computing platforms, further exacerbated on IoT devices by their +pervasiveness and handling of private and critical data. Designing +SCA-resistant computing platforms requires a significant additional effort in +the early stages of the IoT devices' life cycle, which is severely constrained +by strict time-to-market deadlines and tight budgets. This manuscript +introduces a hardware-software framework meant for SCA research on FPGA +targets. It delivers an IoT-class system-on-chip (SoC) that includes a RISC-V +CPU, provides observability and controllability through an ad-hoc debug +infrastructure to facilitate SCA attacks and evaluate the platform's security, +and streamlines the deployment of SCA countermeasures through dedicated +hardware and software features such as a DFS actuator and FreeRTOS support. The +open-source release of the framework includes the SoC, the scripts to configure +the computing platform, compile a target application, and assess the SCA +security, as well as a suite of state-of-the-art SCA attacks and +countermeasures. The goal is to foster its adoption and novel developments in +the field, empowering designers and researchers to focus on studying SCA +countermeasures and attacks while relying on a sound and stable +hardware-software platform as the foundation for their research. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ The Magnificent Seven Challenges and Opportunities in Domain-Specific + Accelerator Design for Autonomous Systems + + +
+ The end of Moore's Law and Dennard Scaling has combined with advances in
+agile hardware design to foster a golden age of domain-specific acceleration.
+However, this new frontier of computing opportunities is not without pitfalls.
+As computer architects approach unfamiliar domains, we have seen common themes
+emerge in the challenges that can hinder progress in the development of useful
+acceleration. In this work, we present the Magnificent Seven Challenges in
+domain-specific accelerator design that can guide adventurous architects to
+contribute meaningfully to novel application domains. Although these challenges
+appear across domains ranging from ML to genomics, we examine them through the
+lens of autonomous systems as a motivating example in this work. From these
+challenges, we identify opportunities and a path forward for successful
+domain-specific accelerator design.
+
+
+ comment: Presented at DAC 2024 +
+
+
+
+
+ + ☆ An Energy-Efficient Artefact Detection Accelerator on FPGAs for + Hyper-Spectral Satellite Imagery + + +
+ Hyper-Spectral Imaging (HSI) is a crucial technique for analysing remote
+sensing data acquired from Earth observation satellites. The rich spatial and
+spectral information obtained through HSI allows for better characterisation
+and exploration of the Earth's surface than traditional techniques like RGB and
+multi-spectral imaging when applied to the downlinked image data at ground
+stations. Sometimes, these images do not contain meaningful information due to
+the presence of clouds or other artefacts, limiting their usefulness.
+Transmitting such artefact-laden HSI images wastes the already scarce energy
+and time budget available for communication. While detecting such artefacts
+before transmitting the HSI image is desirable, the computational complexity of
+these algorithms and the limited power budget on satellites (especially
+CubeSats) are key constraints. This paper presents an unsupervised
+learning-based convolutional autoencoder (CAE) model for artefact
+identification of acquired HSI images at the satellite and a deployment
+architecture on AMD's Zynq Ultrascale FPGAs. The model is trained and tested on
+widely used HSI image datasets: Indian Pines, Salinas Valley, the University of
+Pavia and the Kennedy Space Center. For deployment, the model is quantised to
+8-bit precision, fine-tuned using the Vitis-AI framework and integrated as a
+subordinate accelerator using AMD's Deep-Learning Processing Units (DPU)
+instance on the Zynq device. Our tests show that the model can process each
+spectral band in an HSI image in 4 ms, 2.6x better than INT8 inference on
+Nvidia's Jetson platform & 1.27x better than SOTA artefact detectors. Our model
+also achieves an f1-score of 92.8% and FPR of 0% across the dataset, while
+consuming 21.52 mJ per HSI image, 3.6x better than INT8 Jetson inference & 7.5x
+better than SOTA artefact detectors, making it a viable architecture for
+deployment in CubeSats.
+
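+ The shape of the approach: a small convolutional autoencoder is trained to
+reconstruct clean bands, and a band whose reconstruction error exceeds a
+calibrated threshold is flagged as artefact-laden and not transmitted. A
+minimal PyTorch sketch follows; the paper's actual architecture, Vitis-AI
+quantisation flow, and DPU integration are not reproduced here.
+
+import torch
+import torch.nn as nn
+
+class BandCAE(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.enc = nn.Sequential(
+            nn.Conv2d(1, 8, 3, stride=2, padding=1), nn.ReLU(),
+            nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU())
+        self.dec = nn.Sequential(
+            nn.ConvTranspose2d(16, 8, 3, stride=2, padding=1, output_padding=1),
+            nn.ReLU(),
+            nn.ConvTranspose2d(8, 1, 3, stride=2, padding=1, output_padding=1),
+            nn.Sigmoid())
+
+    def forward(self, x):
+        return self.dec(self.enc(x))
+
+model = BandCAE()
+band = torch.rand(1, 1, 64, 64)              # one spectral band (toy size)
+err = torch.mean((model(band) - band) ** 2)  # high error => flag as artefact
+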
+
+
+
+
+
+
+
+ + Programming and Languages 3 + +
+
+
+ + ♻ ☆ The Elements of Differentiable Programming + + +
+ Artificial intelligence has recently experienced remarkable advances, fueled +by large models, vast datasets, accelerated hardware, and, last but not least, +the transformative power of differentiable programming. This new programming +paradigm enables end-to-end differentiation of complex computer programs +(including those with control flows and data structures), making gradient-based +optimization of program parameters possible. As an emerging paradigm, +differentiable programming builds upon several areas of computer science and +applied mathematics, including automatic differentiation, graphical models, +optimization and statistics. This book presents a comprehensive review of the +fundamental concepts useful for differentiable programming. We adopt two main +perspectives, that of optimization and that of probability, with clear +analogies between the two. Differentiable programming is not merely the +differentiation of programs, but also the thoughtful design of programs +intended for differentiation. By making programs differentiable, we inherently +introduce probability distributions over their execution, providing a means to +quantify the uncertainty associated with program outputs. + +
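+ The core mechanism the book builds on can be shown in a few lines:
+forward-mode automatic differentiation with dual numbers, where every value
+carries its derivative and differentiation composes through ordinary control
+flow. A self-contained sketch:
+
+import math
+
+class Dual:
+    def __init__(self, val, dot=0.0):
+        self.val, self.dot = val, dot
+    def __add__(self, o):
+        o = o if isinstance(o, Dual) else Dual(o)
+        return Dual(self.val + o.val, self.dot + o.dot)
+    __radd__ = __add__
+    def __mul__(self, o):
+        o = o if isinstance(o, Dual) else Dual(o)
+        return Dual(self.val * o.val, self.val * o.dot + self.dot * o.val)
+    __rmul__ = __mul__
+
+def sin(x):                     # derivative rule for one primitive
+    return Dual(math.sin(x.val), math.cos(x.val) * x.dot)
+
+def f(x):                       # differentiation survives control flow
+    y = x * x
+    return sin(y) + 3 * y if y.val > 0 else y
+
+x = Dual(1.5, 1.0)              # seed dx/dx = 1
+print(f(x).val, f(x).dot)       # f(1.5) and f'(1.5)
+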
+
+ comment: Draft version 2 +
+
+
+
+
+ + ♻ ☆ Worst-Case Input Generation for Concurrent Programs under Non-Monotone + Resource Metrics + + +
+ Worst-case input generation aims to automatically generate inputs that +exhibit the worst-case performance of programs. It has several applications, +and can, for example, detect vulnerabilities to denial-of-service attacks. +However, it is non-trivial to generate worst-case inputs for concurrent +programs, particularly for resources like memory where the peak cost depends on +how processes are scheduled. + This article presents the first sound worst-case input generation algorithm +for concurrent programs under non-monotone resource metrics like memory. The +key insight is to leverage resource-annotated session types and symbolic +execution. Session types describe communication protocols on channels in +process calculi. Equipped with resource annotations, resource-annotated session +types not only encode cost bounds but also indicate how many resources can be +reused and transferred between processes. This information is critical for +identifying a worst-case execution path during symbolic execution. The +algorithm is sound: if it returns any input, it is guaranteed to be a valid +worst-case input. The algorithm is also relatively complete: as long as +resource-annotated session types are sufficiently expressive and the background +theory for SMT solving is decidable, a worst-case input is guaranteed to be +returned. A simple case study of a web server's memory usage demonstrates the +utility of the worst-case input generation algorithm. + +
+
+
+
+
+ + ♻ ☆ Higher Order Automatic Differentiation of Higher Order Functions + + +
+ We present semantic correctness proofs of automatic differentiation (AD). We +consider a forward-mode AD method on a higher order language with algebraic +data types, and we characterise it as the unique structure preserving macro +given a choice of derivatives for basic operations. We describe a rich +semantics for differentiable programming, based on diffeological spaces. We +show that it interprets our language, and we phrase what it means for the AD +method to be correct with respect to this semantics. We show that our +characterisation of AD gives rise to an elegant semantic proof of its +correctness based on a gluing construction on diffeological spaces. We explain +how this is, in essence, a logical relations argument. Throughout, we show how +the analysis extends to AD methods for computing higher order derivatives using +a Taylor approximation. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2001.02209 +
+
+
+
+
+
+
+
+ + Computational Complexity 2 + +
+
+
+ + ☆ Solving The Travelling Salesman Problem Using A Single Qubit + + +
+ The travelling salesman problem (TSP) is a popular NP-hard combinatorial
+optimization problem that requires finding the optimal route for a salesman to
+visit each city exactly once and return to the initial city. The existing
+methods for solving TSPs on quantum systems rely on either gate-based or binary
+variable-based encodings. Both approaches are resource-expensive in terms of
+the number of qubits while performing worse than existing classical algorithms
+even for small problem sizes. We present an algorithm that solves an arbitrary
+TSP using a single qubit by invoking the principle of quantum parallelism. The
+cities are represented as quantum states on the Bloch sphere while the
+preparation of superposition states allows us to traverse multiple paths at
+once. The underlying framework of our algorithm is a quantum version of the
+classical Brachistochrone approach. Optimal control methods are employed to
+create a selective superposition of the quantum states to find the shortest
+route of a given TSP. The numerical simulations solve samples of four to nine
+cities, for which exact solutions are obtained. The algorithm can be
+implemented on any quantum platform capable of efficiently rotating a qubit and
+allowing state tomography measurements. For the TSP problem sizes considered in
+this work, our algorithm is more resource-efficient and accurate than existing
+quantum algorithms, with the potential for scalability. A potential speed-up of
+polynomial time over classical algorithms is discussed.
+
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Efficient Convex Optimization Requires Superlinear Memory + + +
+ We show that any memory-constrained, first-order algorithm which minimizes
+$d$-dimensional, $1$-Lipschitz convex functions over the unit ball to
+$1/\mathrm{poly}(d)$ accuracy using at most $d^{1.25 - \delta}$ bits of memory
+must make at least $\tilde{\Omega}(d^{1 + (4/3)\delta})$ first-order queries
+(for any constant $\delta \in [0, 1/4]$). Consequently, the performance of such
+memory-constrained algorithms is a polynomial factor worse than the optimal
+$\tilde{O}(d)$ query bound for this problem obtained by cutting plane methods
+that use $\tilde{O}(d^2)$ memory. This resolves a COLT 2019 open problem of
+Woodworth and Srebro.
+
+
+ comment: 33 pages, 1 figure +
+
+
+
+
+
+
+
+ + Performance Profiling 1 + +
+
+
+ + ☆ SAfEPaTh: A System-Level Approach for Efficient Power and Thermal + Estimation of Convolutional Neural Network Accelerator + + +
+ The design of energy-efficient, high-performance, and reliable Convolutional +Neural Network (CNN) accelerators involves significant challenges due to +complex power and thermal management issues. This paper introduces SAfEPaTh, a +novel system-level approach for accurately estimating power and temperature in +tile-based CNN accelerators. By addressing both steady-state and +transient-state scenarios, SAfEPaTh effectively captures the dynamic effects of +pipeline bubbles in interlayer pipelines, utilizing real CNN workloads for +comprehensive evaluation. Unlike traditional methods, it eliminates the need +for circuit-level simulations or on-chip measurements. Our methodology +leverages TANIA, a cutting-edge hybrid digital-analog tile-based accelerator +featuring analog-in-memory computing cores alongside digital cores. Through +rigorous simulation results using the ResNet18 model, we demonstrate SAfEPaTh's +capability to accurately estimate power and temperature within 500 seconds, +encompassing CNN model accelerator mapping exploration and detailed power and +thermal estimations. This efficiency and accuracy make SAfEPaTh an invaluable +tool for designers, enabling them to optimize performance while adhering to +stringent power and thermal constraints. Furthermore, SAfEPaTh's adaptability +extends its utility across various CNN models and accelerator architectures, +underscoring its broad applicability in the field. This study contributes +significantly to the advancement of energy-efficient and reliable CNN +accelerator designs, addressing critical challenges in dynamic power and +thermal management. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 2 + +
+
+
+ + ☆ Regular language quantum states + + +
+ We introduce regular language states, a family of quantum many-body states. +They are built from a special class of formal languages, called regular, which +has been thoroughly studied in the field of computer science. They can be +understood as the superposition of all the words in a regular language and +encompass physically relevant states such as the GHZ-, W- or Dicke-states. By +leveraging the theory of regular languages, we develop a theoretical framework +to describe them. First, we express them in terms of matrix product states, +providing efficient criteria to recognize them. We then develop a canonical +form which allows us to formulate a fundamental theorem for the equivalence of +regular language states, including under local unitary operations. We also +exploit the theory of tensor networks to find an efficient criterion to +determine when regular languages are shift-invariant. + +
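+ The defining construction, sketched classically: the regular language state
+is the equal-weight superposition of all length-n words accepted by a DFA.
+Choosing the language of all-zeros and all-ones words yields the GHZ state the
+abstract mentions; the DFA below is an illustration, not the paper's notation.
+
+import itertools
+import numpy as np
+
+def regular_language_state(delta, start, accept, n, alphabet=(0, 1)):
+    """Equal-weight superposition over length-n words accepted by the DFA."""
+    amps = np.zeros(len(alphabet) ** n)
+    for idx, word in enumerate(itertools.product(alphabet, repeat=n)):
+        q = start
+        for a in word:
+            q = delta[(q, a)]
+        if q in accept:
+            amps[idx] = 1.0
+    norm = np.linalg.norm(amps)
+    return amps / norm if norm else amps
+
+# DFA accepting 00...0 and 11...1; the resulting state is GHZ on n qubits.
+delta = {("s", 0): "z", ("s", 1): "o", ("z", 0): "z", ("z", 1): "x",
+         ("o", 1): "o", ("o", 0): "x", ("x", 0): "x", ("x", 1): "x"}
+ghz = regular_language_state(delta, "s", {"z", "o"}, n=4)
+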
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ A process algebraic framework for multi-agent dynamic epistemic systems + + +
+ This paper combines the classical model of labeled transition systems with +the epistemic model for reasoning about knowledge. The result is a unifying +framework for modeling and analyzing multi-agent, knowledge-based, dynamic +systems. On the modeling side, we propose a process algebraic, agent-oriented +specification language that makes such a framework easy to use for practical +purposes. On the verification side, we define a modal logic encompassing +temporal and epistemic operators. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 16 + +
+
+
+ + ☆ ExaWorks Software Development Kit: A Robust and Scalable Collection of + Interoperable Workflow Technologies + + +
+ Scientific discovery increasingly requires executing heterogeneous scientific +workflows on high-performance computing (HPC) platforms. Heterogeneous +workflows contain different types of tasks (e.g., simulation, analysis, and +learning) that need to be mapped, scheduled, and launched on different +computing. That requires a software stack that enables users to code their +workflows and automate resource management and workflow execution. Currently, +there are many workflow technologies with diverse levels of robustness and +capabilities, and users face difficult choices of software that can effectively +and efficiently support their use cases on HPC machines, especially when +considering the latest exascale platforms. We contributed to addressing this +issue by developing the ExaWorks Software Development Kit (SDK). The SDK is a +curated collection of workflow technologies engineered following current best +practices and specifically designed to work on HPC platforms. We present our +experience with (1) curating those technologies, (2) integrating them to +provide users with new capabilities, (3) developing a continuous integration +platform to test the SDK on DOE HPC platforms, (4) designing a dashboard to +publish the results of those tests, and (5) devising an innovative +documentation platform to help users to use those technologies. Our experience +details the requirements and the best practices needed to curate workflow +technologies, and it also serves as a blueprint for the capabilities and +services that DOE will have to offer to support a variety of scientific +heterogeneous workflows on the newly available exascale HPC platforms. + +
+
+
+
+
+ + ☆ COALA: A Practical and Vision-Centric Federated Learning Platform + + +
+ We present COALA, a vision-centric Federated Learning (FL) platform, and a +suite of benchmarks for practical FL scenarios, which we categorize into three +levels: task, data, and model. At the task level, COALA extends support from +simple classification to 15 computer vision tasks, including object detection, +segmentation, pose estimation, and more. It also facilitates federated +multiple-task learning, allowing clients to tackle multiple tasks +simultaneously. At the data level, COALA goes beyond supervised FL to benchmark +both semi-supervised FL and unsupervised FL. It also benchmarks feature +distribution shifts other than commonly considered label distribution shifts. +In addition to dealing with static data, it supports federated continual +learning for continuously changing data in real-world scenarios. At the model +level, COALA benchmarks FL with split models and different models in different +clients. COALA platform offers three degrees of customization for these +practical FL scenarios, including configuration customization, components +customization, and workflow customization. We conduct systematic benchmarking +experiments for the practical FL scenarios and highlight potential +opportunities for further advancements in FL. Codes are open sourced at +https://github.com/SonyResearch/COALA. + +
+
+ comment: ICML'24 +
+
+
+
+
+ + ☆ DRAM Errors and Cosmic Rays: Space Invaders or Science Fiction? + + +
+ It is widely accepted that cosmic rays are a plausible cause of DRAM errors
+in high-performance computing (HPC) systems, and various studies suggest that
+they could explain some aspects of the observed DRAM error behavior. However,
+this phenomenon is insufficiently studied in production environments. We
+analyze the correlations between cosmic rays and DRAM errors on two HPC
+clusters: a production supercomputer with server-class DDR3-1600 and a
+prototype with LPDDR3-1600 and no hardware error correction. Our error logs
+cover 2000 billion MB-hours for the MareNostrum 3 supercomputer and 135 million
+MB-hours for the Mont-Blanc prototype. Our analysis combines quantitative
+analysis, formal statistical methods and machine learning. We detect no
+indications that cosmic rays have any influence on the DRAM errors. To
+understand whether the findings are specific to the systems under study,
+located 100 meters above sea level, the analysis should be repeated on other
+HPC clusters, especially those at higher altitudes. Also, the analysis can (and
+should) be applied to revisit and extend numerous previous studies which use
+cosmic rays as a hypothetical explanation for some aspects of the observed DRAM
+error behavior.
+
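+ The simplest version of the test involved: correlate a neutron-flux time
+series with hourly DRAM error counts. Synthetic, independent data stand in for
+the production logs here; the study additionally applies formal statistical
+methods and machine learning.
+
+import numpy as np
+from scipy.stats import pearsonr, spearmanr
+
+rng = np.random.default_rng(0)
+flux = 100 + 5 * rng.standard_normal(10_000)   # neutron counts per hour
+errors = rng.poisson(0.01, 10_000)             # DRAM errors per hour
+
+r_p, p_p = pearsonr(flux, errors)
+r_s, p_s = spearmanr(flux, errors)
+print(f"Pearson r={r_p:.3f} (p={p_p:.2f}); Spearman r={r_s:.3f} (p={p_s:.2f})")
+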
+
+
+
+
+ + ☆ Prisec II -- A Comprehensive Model for IoT Security: Cryptographic + Algorithms and Cloud Integration + + +
+ This study addresses the critical issue of ensuring data security and +efficiency in interconnected devices, especially in IoT environments. The +objective is to design and implement a model using cryptographic algorithms to +enhance data security in 5G networks. Challenges arise from the limited +computational capabilities of IoT devices, which require the analysis and +selection of cryptographic algorithms to achieve efficient data transmission. +This study proposes a model that includes four levels of security, each +employing different levels of encryption to provide better data security. +Finally, cloud computing optimizes processing efficiency and resource +utilization to improve data transmission. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Reinforcement Learning-based Adaptive Mitigation of Uncorrected DRAM + Errors in the Field + + +
+ Scaling to larger systems, with current levels of reliability, requires +cost-effective methods to mitigate hardware failures. One of the main causes of +hardware failure is an uncorrected error in memory, which terminates the +current job and wastes all computation since the last checkpoint. This paper +presents the first adaptive method for triggering uncorrected error mitigation. +It uses a prediction approach that considers the likelihood of an uncorrected +error and its current potential cost. The method is based on reinforcement +learning, and the only user-defined parameters are the mitigation cost and +whether the job can be restarted from a mitigation point. We evaluate our +method using classical machine learning metrics together with a cost-benefit +analysis, which compares the cost of mitigation actions with the benefits from +mitigating some of the errors. On two years of production logs from the +MareNostrum supercomputer, our method reduces lost compute time by 54% compared +with no mitigation and is just 6% below the optimal Oracle method. All source +code is open source. + +
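+ The decision the learned policy approximates can be stated in a few lines:
+trigger mitigation when the expected cost of losing work to an uncorrected
+error exceeds the (user-defined) mitigation cost. The error probability below
+would come from the learned predictor; it is a stand-in here.
+
+def should_mitigate(p_error, hours_since_checkpoint, mitigation_cost_hours):
+    expected_loss = p_error * hours_since_checkpoint   # expected lost work
+    return expected_loss > mitigation_cost_hours
+
+print(should_mitigate(p_error=0.05, hours_since_checkpoint=12,
+                      mitigation_cost_hours=0.5))      # True: mitigate now
+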
+
+ comment: Published in HPDC'24 +
+
+
+
+
+ + ☆ Sizey: Memory-Efficient Execution of Scientific Workflow Tasks + + +
+ As the amount of available data continues to grow in fields as diverse as +bioinformatics, physics, and remote sensing, the importance of scientific +workflows in the design and implementation of reproducible data analysis +pipelines increases. When developing workflows, resource requirements must be +defined for each type of task in the workflow. Typically, task types vary +widely in their computational demands because they are simply wrappers for +arbitrary black-box analysis tools. Furthermore, the resource consumption for +the same task type can vary considerably as well due to different inputs. Since +underestimating memory resources leads to bottlenecks and task failures, +workflow developers tend to overestimate memory resources. However, +overprovisioning of memory wastes resources and limits cluster throughput. + Addressing this problem, we propose Sizey, a novel online memory prediction +method for workflow tasks. During workflow execution, Sizey simultaneously +trains multiple machine learning models and then dynamically selects the best +model for each workflow task. To evaluate the quality of the model, we +introduce a novel resource allocation quality (RAQ) score based on memory +prediction accuracy and efficiency. Sizey's prediction models are retrained and +re-evaluated online during workflow execution, continuously incorporating +metrics from completed tasks. + Our evaluation with a prototype implementation of Sizey uses metrics from six +real-world scientific workflows from the popular nf-core framework and shows a +median reduction in memory waste over time of 24.68% compared to the respective +best-performing state-of-the-art baseline. + +
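+ A sketch of the online model-selection loop with a stand-in scoring function:
+several regressors are trained on completed tasks, and the one with the best
+score (penalising underprediction, which risks failures, and overprediction,
+which wastes memory) serves the next prediction. The weighting below is
+illustrative, not the paper's RAQ definition.
+
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.neighbors import KNeighborsRegressor
+
+def raq(pred, actual):                 # stand-in resource allocation quality
+    under = np.mean(pred < actual)     # underprediction => failure risk
+    waste = np.mean(np.maximum(pred - actual, 0) / actual)
+    return 1.0 / (1.0 + 10.0 * under + waste)
+
+rng = np.random.default_rng(1)
+X = rng.uniform(1, 100, (200, 1))              # e.g. input size per task
+y = 50 + 3 * X[:, 0] + rng.normal(0, 5, 200)   # observed peak memory (MB)
+
+models = [LinearRegression(), RandomForestRegressor(n_estimators=30),
+          KNeighborsRegressor()]
+scores = []
+for m in models:
+    m.fit(X[:150], y[:150])
+    scores.append(raq(m.predict(X[150:]), y[150:]))
+best = models[int(np.argmax(scores))]   # re-selected as new tasks complete
+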
+
+ comment: Paper accepted in 2024 IEEE International Conference on Cluster + Computing (CLUSTER) +
+
+
+
+
+ + ☆ A Programming Model for Disaggregated Memory over CXL + + +
+ CXL (Compute Express Link) is an emerging open industry-standard interconnect +between processing and memory devices that is expected to revolutionize the way +systems are designed in the near future. It enables cache-coherent shared +memory pools in a disaggregated fashion at unprecedented scales, allowing +algorithms to interact with a variety of storage devices using simple loads and +stores in a cacheline granularity. Alongside with unleashing unique +opportunities for a wide range of applications, CXL introduces new challenges +of data management and crash consistency. Alas, CXL lacks an adequate +programming model, which makes reasoning about the correctness and expected +behaviors of algorithms and systems on top of it nearly impossible. + In this work, we present CXL0, the first programming model for concurrent +programs running on top of CXL. We propose a high-level abstraction for CXL +memory accesses and formally define operational semantics on top of that +abstraction. We provide a set of general transformations that adapt concurrent +algorithms to the new disruptive technology. Using these transformations, every +linearizable algorithm can be easily transformed into its provably correct +version in the face of a full-system or sub-system crash. We believe that this +work will serve as the stepping stone for systems design and modelling on top +of CXL, and support the development of future models as software and hardware +evolve. + +
+
+
+
+
+ + ☆ Inference Load-Aware Orchestration for Hierarchical Federated Learning + + +
+ Hierarchical federated learning (HFL) designs introduce intermediate +aggregator nodes between clients and the global federated learning server in +order to reduce communication costs and distribute server load. One side effect +is that machine learning model replication at scale comes "for free" as part of +the HFL process: model replicas are hosted at the client end, intermediate +nodes, and the global server level and are readily available for serving +inference requests. This creates opportunities for efficient model serving but +simultaneously couples the training and serving processes and calls for their +joint orchestration. This is particularly important for continual learning, +where serving a model while (re)training it periodically, upon specific +triggers, or continuously, takes place over shared infrastructure spanning the +computing continuum. Consequently, training and inference workloads can +interfere with detrimental effects on performance. To address this issue, we +propose an inference load-aware HFL orchestration scheme, which makes informed +decisions on HFL configuration, considering knowledge about inference workloads +and the respective processing capacity. Applying our scheme to a continual +learning use case in the transportation domain, we demonstrate that by +optimizing aggregator node placement and device-aggregator association, +significant inference latency savings can be achieved while communication costs +are drastically reduced compared to flat centralized federated learning. + +
+
+
+
+
+ + ☆ Distributed Difference of Convex Optimization + + +
+ In this article, we focus on solving a class of distributed optimization
+problems involving $n$ agents with the local objective function at every agent
+$i$ given by the difference of two convex functions $f_i$ and $g_i$
+(difference-of-convex (DC) form), where $f_i$ and $g_i$ are potentially
+nonsmooth. The agents communicate via a directed graph containing $n$ nodes. We
+create smooth approximations of the functions $f_i$ and $g_i$ and develop a
+distributed algorithm utilizing the gradients of the smooth surrogates and a
+finite-time approximate consensus protocol. We term this algorithm
+DDC-Consensus. The developed DDC-Consensus algorithm allows for non-symmetric
+directed graph topologies and can be synthesized distributively. We establish
+that the DDC-Consensus algorithm converges to a stationary point of the
+nonconvex distributed optimization problem. The performance of the
+DDC-Consensus algorithm is evaluated via a simulation study to solve a
+nonconvex DC-regularized distributed least squares problem. The numerical
+results corroborate the efficacy of the proposed algorithm.
+
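+ One local iteration of this flavour of method, sketched with numpy: each
+agent takes a gradient step on its smoothed DC objective f_i - g_i and mixes
+its iterate with neighbours through a row-stochastic matrix. The smoothing
+construction, step-size rules, and finite-time consensus protocol of
+DDC-Consensus are simplified away here.
+
+import numpy as np
+
+def dc_consensus_step(x, grads_f, grads_g, W, alpha=0.01):
+    """x: (agents, dim) iterates; W: row-stochastic mixing matrix."""
+    descent = np.array([gf(xi) - gg(xi)        # gradient of f_i - g_i
+                        for gf, gg, xi in zip(grads_f, grads_g, x)])
+    return W @ (x - alpha * descent)
+
+# Example DC pair per agent: f_i(x) = ||x - a_i||^2, g_i(x) = 0.1 ||x||^2
+a = np.array([[1.0], [3.0], [5.0]])
+grads_f = [lambda x, ai=ai: 2.0 * (x - ai) for ai in a]
+grads_g = [lambda x: 0.2 * x] * 3
+W = np.array([[0.5, 0.5, 0.0], [0.0, 0.5, 0.5], [0.5, 0.0, 0.5]])
+x = np.zeros((3, 1))
+for _ in range(500):
+    x = dc_consensus_step(x, grads_f, grads_g, W)   # agents reach consensus
+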
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A simple and fast C++ thread pool implementation capable of running task + graphs + + +
+ In this paper, the author presents a simple and fast C++ thread pool +implementation capable of running task graphs. The implementation is publicly +available on GitHub, see https://github.com/dpuyda/scheduling. + +
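+ The linked repository is C++; the core idea, running a task graph on a pool
+by tracking each task's unmet-dependency count and submitting it the moment
+that count reaches zero, carries over directly. A Python sketch of that
+mechanism (not the repository's API):
+
+from concurrent.futures import ThreadPoolExecutor
+import threading
+
+def run_graph(tasks, deps, workers=4):
+    """tasks: name -> callable; deps: name -> set of prerequisite names."""
+    remaining = {t: len(deps.get(t, ())) for t in tasks}
+    dependents = {t: [u for u in tasks if t in deps.get(u, ())] for t in tasks}
+    lock, done = threading.Lock(), threading.Event()
+    pending = len(tasks)
+
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        def finish(name):
+            nonlocal pending
+            ready = []
+            with lock:
+                pending -= 1
+                for u in dependents[name]:
+                    remaining[u] -= 1
+                    if remaining[u] == 0:   # last prerequisite finished
+                        ready.append(u)
+                if pending == 0:
+                    done.set()
+            for u in ready:
+                pool.submit(wrap, u)
+
+        def wrap(name):
+            tasks[name]()
+            finish(name)
+
+        for t in tasks:
+            if remaining[t] == 0:           # roots start immediately
+                pool.submit(wrap, t)
+        done.wait()
+
+run_graph({"a": lambda: print("a"), "b": lambda: print("b"),
+           "c": lambda: print("c")}, {"c": {"a", "b"}})
+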
+
+
+
+
+ + ♻ ☆ SemiSFL: Split Federated Learning on Unlabeled and Non-IID Data + + +
+ Federated Learning (FL) has emerged to allow multiple clients to
+collaboratively train machine learning models on their private data at the
+network edge. However, training and deploying large-scale models on
+resource-constrained devices is challenging. Fortunately, Split Federated
+Learning (SFL) offers a feasible solution by alleviating the computation and/or
+communication burden on clients. However, existing SFL works often assume
+sufficient labeled data on clients, which is usually impractical. Besides, data
+non-IIDness poses another challenge to ensure efficient model training. To the
+best of our knowledge, the above two issues have not been simultaneously
+addressed in SFL. Herein, we propose a novel Semi-supervised SFL system, termed
+SemiSFL, which incorporates clustering regularization to perform SFL with
+unlabeled and non-IID client data. Moreover, our theoretical and experimental
+investigations into model convergence reveal that the inconsistent training
+processes on labeled and unlabeled data have an influence on the effectiveness
+of clustering regularization. To mitigate the training inconsistency, we
+develop an algorithm for dynamically adjusting the global updating frequency,
+so as to improve training performance. Extensive experiments on benchmark
+models and datasets show that our system provides a 3.8x speed-up in training
+time, reduces the communication cost by about 70.3% while reaching the target
+accuracy, and achieves up to 5.8% improvement in accuracy under non-IID
+scenarios compared to the state-of-the-art baselines.
+
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Rendering Wireless Environments Useful for Gradient Estimators: A + Zero-Order Stochastic Federated Learning Method + + +
+ Cross-device federated learning (FL) is a growing machine learning setting
+whereby multiple edge devices collaborate to train a model without disclosing
+their raw data. As ever more mobile devices participate in FL applications over
+the wireless environment, the limited uplink capacity of these devices becomes
+a critical bottleneck for practical deployment. In this work, we propose a
+novel doubly communication-efficient zero-order (ZO) method with a one-point
+gradient estimator that replaces communicating long vectors with scalar values
+and that harnesses the nature of the wireless communication channel, overcoming
+the need to know the channel state coefficient. It is the first method that
+includes the wireless channel in the learning algorithm itself instead of
+wasting resources to analyze it and remove its impact. We then offer a thorough
+analysis of the proposed zero-order federated learning (ZOFL) framework and
+prove that our method converges \textit{almost surely}, which is a novel result
+in nonconvex ZO optimization. We further prove a convergence rate of
+$O(\frac{1}{\sqrt[3]{K}})$ in the nonconvex setting. We finally demonstrate the
+potential of our algorithm with experimental results.
+
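+ The heart of such methods is the one-point zero-order gradient estimate: a
+single scalar function evaluation per step, taken along a random direction,
+replaces the long gradient vector on the uplink. The sketch below omits the
+wireless-channel coefficient the paper folds into the estimator; all names and
+constants are illustrative.
+
+import numpy as np
+
+def zo_step(f, x, lr, beta, rng):
+    u = rng.standard_normal(x.shape)
+    u /= np.linalg.norm(u)
+    g_hat = (f(x + beta * u) / beta) * u   # one-point estimate: one scalar sent
+    return x - lr * g_hat
+
+f = lambda x: float(np.sum((x - 3.0) ** 2))    # toy objective
+x, rng = np.zeros(5), np.random.default_rng(0)
+for _ in range(20_000):
+    x = zo_step(f, x, lr=0.002, beta=0.5, rng=rng)   # noisy but cheap to send
+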
+
+
+
+
+ + ♻ ☆ FedPop: Federated Population-based Hyperparameter Tuning + + +
+ Federated Learning (FL) is a distributed machine learning (ML) paradigm, in +which multiple clients collaboratively train ML models without centralizing +their local data. Similar to conventional ML pipelines, the client local +optimization and server aggregation procedure in FL are sensitive to the +hyperparameter (HP) selection. Despite extensive research on tuning HPs for +centralized ML, these methods yield suboptimal results when employed in FL. +This is mainly because their "training-after-tuning" framework is unsuitable +for FL with limited client computation power. While some approaches have been +proposed for HP-Tuning in FL, they are limited to the HPs for client local +updates. In this work, we propose a novel HP-tuning algorithm, called Federated +Population-based Hyperparameter Tuning (FedPop), to address this vital yet +challenging problem. FedPop employs population-based evolutionary algorithms to +optimize the HPs, which accommodates various HP types at both the client and +server sides. Compared with prior tuning methods, FedPop employs an online +"tuning-while-training" framework, offering computational efficiency and +enabling the exploration of a broader HP search space. Our empirical validation +on the common FL benchmarks and complex real-world FL datasets, including +full-sized Non-IID ImageNet-1K, demonstrates the effectiveness of the proposed +method, which substantially outperforms the concurrent state-of-the-art +HP-tuning methods in FL. + +
+
+ comment: Code: https://github.com/HaokunChen245/FedPop +
+
+
+
+
+ + ♻ ☆ Active Admission Control in a P2P Distributed Environment for Capacity + Efficient Livestreaming in Mobile Wireless Networks SC + + +
+ In this study, the Active Control in an Intelligent and Distributed
+Environment (ACIDE) media distribution model solution and algorithms are
+proposed for livestreaming in capacity efficient mobile wireless networks. The
+elements of the ACIDE model are a base station and a cluster formed by a number
+of peers able to establish peer to peer communications. The cluster peers are
+selected from a group of users interested in livestreaming the same media. The
+ACIDE model solution minimizes the bandwidth allocated to a cluster of n peers
+such that uninterrupted media play is guaranteed for all peers. The livestream
+media is sent to the peers in packages and every media package is divided into
+n blocks. The blocks are distributed to the n peers of a cluster in two phases,
+such that the base station bandwidth is utilized during the first phase only.
+The allocated bandwidth, the amount of bandwidth the base station has to
+allocate to a cluster, is minimized and its lower bound is equal to the
+bandwidth required for multicasting. In this study, the ACIDE model is used to
+address the problem of finding the maximum number of peers n, chosen from a
+group of N users, that can be admitted to a cluster given the allocated
+bandwidth, the amount of bandwidth that a base station allocates to a cluster
+in advance, prior to admitting users. When users become peers of an ACIDE
+cluster, the network capacity, the total number of users who are able to access
+live media, increases, meaning that network resources are used more
+efficiently. The problem of finding the maximum number of peers n is addressed
+as an optimization problem, with the objective of having the entire given
+allocated bandwidth used by the peers admitted to the cluster. This problem is
+NP-complete, and a non-optimal solution is proposed for peer selection such
+that all admitted peers play media continuously.
+
+
+ comment: 8 pages, 6 figures, 3 tables; Accepted for publication in: + Proceedings of the 2023 International Conference on Computational Science and + Computational Intelligence (CSCI'23: December 13-15, 2023, Las Vegas, Nevada, + USA); Publisher: IEEE Computer Society (CPS) +
+
+
+
+
+ + ♻ ☆ Refined Bitcoin Security-Latency Under Network Delay + + +
+ We study security-latency bounds for Nakamoto consensus, i.e., how secure a +block is after it becomes $k$-deep in the chain. We improve the +state-of-the-art bounds by analyzing the race between adversarial and honest +chains in three different phases. We find the probability distribution of the +growth of the adversarial chains under models similar to those in [Guo, Ren; +AFT 2022] when a target block becomes $k$-deep in the chain. We analyze certain +properties of this race to model each phase with random walks that provide +tighter bounds than the existing results. Combining all three phases provides +novel upper and lower bounds for blockchains with small $\lambda\Delta$. + +
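+ A Monte Carlo stand-in for the race being analysed: once a target block is
+k-deep, does a private adversarial chain mining a fraction beta of the hash
+power ever erase the deficit? Network delay and the paper's three-phase
+refinement are omitted; this only illustrates the object of study.
+
+import random
+
+def catchup_probability(k=6, beta=0.3, horizon=200, trials=10_000):
+    """P(a private chain with hash share beta ever erases a k-block lead)."""
+    wins = 0
+    for _ in range(trials):
+        deficit = k
+        for _ in range(horizon):                 # truncated random walk
+            deficit += -1 if random.random() < beta else 1
+            if deficit == 0:
+                wins += 1
+                break
+    return wins / trials
+
+print(catchup_probability())   # approaches (beta/(1-beta))**k for beta < 0.5
+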
+
+
+
+
+ + ♻ ☆ Portable, heterogeneous ensemble workflows at scale using libEnsemble + + +
+ libEnsemble is a Python-based toolkit for running dynamic ensembles, +developed as part of the DOE Exascale Computing Project. The toolkit utilizes a +unique generator--simulator--allocator paradigm, where generators produce input +for simulators, simulators evaluate those inputs, and allocators decide whether +and when a simulator or generator should be called. The generator steers the +ensemble based on simulation results. Generators may, for example, apply +methods for numerical optimization, machine learning, or statistical +calibration. libEnsemble communicates between a manager and workers. We +overview the unique characteristics of libEnsemble as well as current and +potential interoperability with other packages in the workflow ecosystem. We +highlight libEnsemble's dynamic resource features: libEnsemble can detect +system resources, such as available nodes, cores, and GPUs, and assign these in +a portable way. These features allow users to specify the number of processors +and GPUs required for each simulation; and resources will be automatically +assigned on a wide range of systems, including Frontier, Aurora, and +Perlmutter. Such ensembles can include multiple simulation types, some using +GPUs and others using only CPUs, sharing nodes for maximum efficiency. We also +describe the benefits of libEnsemble's generator--simulator coupling, which +easily exposes to the user the ability to cancel, and portably kill, running +simulations based on models that are updated with intermediate simulation +output. We demonstrate libEnsemble's capabilities, scalability, and scientific +impact via a Gaussian process surrogate training problem for the longitudinal +density profile at the exit of a plasma accelerator stage. The study uses gpCAM +for the surrogate model and employs either Wake-T or WarpX simulations, +highlighting efficient use of resources that can easily extend to exascale. + +
+
+
+
+
+
+
+
+
+ Hardware Architecture 2
+
+
+
+
+ ☆ OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and
+ Self-Reflection
+
+
+ Recent studies have illuminated that Large Language Models (LLMs) exhibit
+substantial potential in the realm of RTL (Register Transfer Level) code
+generation, with notable advancements evidenced by commercial models such as
+GPT-4 and Claude3-Opus. Despite their proficiency, these commercial LLMs often
+raise concerns regarding privacy and security. Conversely, open-source LLMs,
+which offer solutions to these concerns, have inferior performance in RTL code
+generation tasks compared to commercial models due to the lack of high-quality
+open-source RTL datasets. To address this issue, we introduce OriGen, a fully
+open-source framework featuring self-reflection capabilities and a dataset
+augmentation methodology for generating high-quality, large-scale RTL code. We
+propose a novel code-to-code augmentation methodology that leverages knowledge
+distillation to enhance the quality of the open-source RTL code datasets.
+Additionally, OriGen is capable of correcting syntactic errors by leveraging a
+self-reflection process based on feedback from the compiler. The
+self-reflection ability of the model is facilitated by a carefully constructed
+dataset, which comprises a comprehensive collection of samples. Experimental
+results demonstrate that OriGen remarkably outperforms other open-source
+alternatives in RTL code generation, surpassing the previous best-performing
+LLM by 9.8% on the VerilogEval-Human benchmark. Furthermore, OriGen exhibits
+superior capabilities in self-reflection and error rectification, surpassing
+GPT-4 by 18.1% on the benchmark designed to evaluate the capability of
+self-reflection.
+
+
+
+
+
+ + ☆ Rome was Not Built in a Single Step: Hierarchical Prompting for + LLM-based Chip Design + + +
+ Large Language Models (LLMs) are effective in computer hardware synthesis via +hardware description language (HDL) generation. However, LLM-assisted +approaches for HDL generation struggle when handling complex tasks. We +introduce a suite of hierarchical prompting techniques which facilitate +efficient stepwise design methods, and develop a generalizable automation +pipeline for the process. To evaluate these techniques, we present a benchmark +set of hardware designs which have solutions with or without architectural +hierarchy. Using these benchmarks, we compare various open-source and +proprietary LLMs, including our own fine-tuned Code Llama-Verilog model. Our +hierarchical methods automatically produce successful designs for complex +hardware modules that standard flat prompting methods cannot achieve, allowing +smaller open-source LLMs to compete with large proprietary models. Hierarchical +prompting reduces HDL generation time and yields savings on LLM costs. Our +experiments detail which LLMs are capable of which applications, and how to +apply hierarchical methods in various modes. We explore case studies of +generating complex cores using automatic scripted hierarchical prompts, +including the first-ever LLM-designed processor with no human feedback. + +
+
+ comment: Accepted at MLCAD '24. 10 pages, 7 figures, 5 tables +
+
+
+
+
+
+
+
+ + Programming and Languages 7 + +
+
+
+ + ☆ Language-Based Security for Low-Level MPC + + +
+ Secure Multi-Party Computation (MPC) is an important enabling technology for +data privacy in modern distributed applications. Currently, proof methods for +low-level MPC protocols are primarily manual and thus tedious and error-prone, +and are also non-standardized and unfamiliar to most PL theorists. As a step +towards better language support and language-based enforcement, we develop a +new staged PL for defining a variety of low-level probabilistic MPC protocols. +We also formulate a collection of confidentiality and integrity hyperproperties +for our language model that are familiar from information flow, including +conditional noninterference, gradual release, and robust declassification. We +demonstrate their relation to standard MPC threat models of passive and +malicious security, and how they can be leveraged in security verification of +protocols. To prove these properties we develop automated tactics in +$\mathbb{F}_2$ that can be integrated with separation logic-style reasoning. + +
+
+
+
+
+ + ☆ SPLAT: A framework for optimised GPU code-generation for SParse reguLar + ATtention + + +
+ Multi-head-self-attention (MHSA) mechanisms achieve state-of-the-art (SOTA) +performance across natural language processing and vision tasks. However, their +quadratic dependence on sequence lengths has bottlenecked inference speeds. To +circumvent this bottleneck, researchers have proposed various sparse-MHSA +models, where a subset of full attention is computed. Despite their promise, +current sparse libraries and compilers do not support high-performance +implementations for diverse sparse-MHSA patterns due to the underlying sparse +formats they operate on. These formats, which are typically designed for +high-performance & scientific computing applications, are either curated for +extreme amounts of random sparsity (<1% non-zero values), or specific sparsity +patterns. However, the sparsity patterns in sparse-MHSA are moderately sparse +(10-50% non-zero values) and varied, resulting in existing sparse-formats +trading off generality for performance. + We bridge this gap, achieving both generality and performance, by proposing a +novel sparse format: affine-compressed-sparse-row (ACSR) and supporting +code-generation scheme, SPLAT, that generates high-performance implementations +for diverse sparse-MHSA patterns on GPUs. Core to our proposed format and code +generation algorithm is the observation that common sparse-MHSA patterns have +uniquely regular geometric properties. These properties, which can be analyzed +just-in-time, expose novel optimizations and tiling strategies that SPLAT +exploits to generate high-performance implementations for diverse patterns. To +demonstrate SPLAT's efficacy, we use it to generate code for various +sparse-MHSA models, achieving geomean speedups of 2.05x and 4.05x over +hand-written kernels written in triton and TVM respectively on A100 GPUs. +Moreover, its interfaces are intuitive and easy to use with existing +implementations of MHSA in JAX. + +
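+ A sketch of why an affine row encoding suffices for regular patterns like
+sliding-window attention: the column range of row i is an affine function of i,
+so a per-row (start, end) rule replaces explicit index arrays. This mimics the
+flavour of ACSR only; SPLAT's actual format and GPU code generation are in the
+paper, and all names below are illustrative.
+
+import numpy as np
+
+def banded_cols(i, n, w):
+    return max(0, i - w), min(n, i + w + 1)   # row i's columns: affine in i
+
+def banded_attention_apply(scores, V, w):
+    """scores: (n, 2w+1) packed weights; scores[i, j] maps to column start+j."""
+    n, d = V.shape
+    out = np.zeros((n, d))
+    for i in range(n):
+        s, e = banded_cols(i, n, w)
+        out[i] = scores[i, : e - s] @ V[s:e]
+    return out
+
+n, d, w = 8, 4, 2
+out = banded_attention_apply(np.full((n, 2 * w + 1), 0.2), np.ones((n, d)), w)
+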
+
+ comment: 31 pages, 16 figures +
+
+
+
+
+ + ☆ Qudit Quantum Programming with Projective Cliffords + + +
+ This paper introduces a novel abstraction for programming quantum operations, +specifically projective Cliffords, as functions over the qudit Pauli group. We +define a categorical semantics for projective Cliffords based on Pauli +encodings in terms of $\mathbb{Z}_d$-linear maps. We then introduce a type +system and lambda calculus for both $\mathbb{Z}_d$-linear maps and projective +Cliffords, and prove that these type systems have a sound denotational +semantics in terms of the relevant categories. Finally, we explore what it +means to program with projective Cliffords through a number of examples and +programming constructions. + +
+
+ comment: 42 pages +
+
+
+
+
+ + ♻ ☆ A Coq Mechanization of JavaScript Regular Expression Semantics + + +
+ We present an executable, proven-safe, faithful, and future-proof Coq
+mechanization of JavaScript regular expression (regex) matching, as specified
+by the last published edition of ECMA-262 section 22.2. This is, to our
+knowledge, the first time that an industrial-strength regex language has been
+faithfully mechanized in an interactive theorem prover. We highlight
+interesting challenges that arose in the process (including issues of encoding,
+corner cases, and executability), and we document the steps that we took to
+ensure that the result is straightforwardly auditable and that our
+understanding of the spec aligns with existing implementations.
+ We demonstrate the usability and versatility of the mechanization through a
+broad collection of analyses, case studies, and experiments: we prove that
+JavaScript regex matching always terminates and is safe (no assertion
+failures); we identify subtle corner cases that led to mistakes in previous
+publications; we verify an optimization extracted from a state-of-the-art regex
+engine; we show that some classic properties described in automata textbooks
+and used in derivatives-based matchers do not hold in JavaScript regexes; and
+we demonstrate that the cost of updating the mechanization to account for
+changes in the original specification is reasonably low.
+ Our mechanization can be extracted to OCaml and linked with Unicode libraries
+to produce an executable engine that passes the relevant parts of the official
+Test262 conformance test suite.
+
+
+
+
+
+ + ♻ ☆ Linear Matching of JavaScript Regular Expressions + + +
+ Modern regex languages have strayed far from well-understood traditional +regular expressions: they include features that fundamentally transform the +matching problem. In exchange for these features, modern regex engines at times +suffer from exponential complexity blowups, a frequent source of +denial-of-service vulnerabilities in JavaScript applications. Worse, regex +semantics differ across languages, and the impact of these divergences on +algorithmic design and worst-case matching complexity has seldom been +investigated. + This paper provides a novel perspective on JavaScript's regex semantics by +identifying a larger-than-previously-understood subset of the language that can +be matched with linear time guarantees. In the process, we discover several +cases where state-of-the-art algorithms were either wrong (semantically +incorrect), inefficient (suffering from superlinear complexity) or excessively +restrictive (assuming certain features could not be matched linearly). We +introduce novel algorithms to restore correctness and linear complexity. We +further advance the state-of-the-art in linear regex matching by presenting the +first nonbacktracking algorithms for matching lookarounds in linear time: one +supporting captureless lookbehinds in any regex language, and another +leveraging a JavaScript property to support unrestricted lookaheads and +lookbehinds. Finally, we describe new time and space complexity tradeoffs for +regex engines. All of our algorithms are practical: we validated them in a +prototype implementation, and some have also been merged in the V8 JavaScript +implementation used in Chrome and Node.js. + +
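+ The reason linear-time matching is possible at all: simulate every NFA state
+simultaneously, one input symbol at a time, so the work is O(|text| x |states|)
+with no backtracking. A tiny hard-coded NFA (for (a|b)*abb over the full input)
+illustrates the mechanism; JavaScript features such as lookarounds need the
+extensions developed in the paper.
+
+def nfa_match(text, transitions, start, accept):
+    states = {start}
+    for ch in text:                    # one pass, all states advanced at once
+        states = {t for s in states for t in transitions.get((s, ch), ())}
+        if not states:
+            return False
+    return bool(states & accept)
+
+# NFA for (a|b)*abb over states 0..3, accepting state 3
+trans = {(0, "a"): {0, 1}, (0, "b"): {0},
+         (1, "b"): {2}, (2, "b"): {3}}
+print(nfa_match("ababb", trans, 0, {3}))   # True
+print(nfa_match("abab", trans, 0, {3}))    # False
+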
+
+
+
+
+ + ♻ ☆ Typed compositional quantum computation with lenses + + +
+ We propose a type-theoretic framework for describing and proving properties +of quantum computations, in particular those presented as quantum circuits. Our +proposal is based on an observation that, in the polymorphic type system of +Coq, currying on quantum states allows us to apply quantum gates directly +inside a complex circuit. By introducing a discrete notion of lens to control +this currying, we are further able to separate the combinatorics of the circuit +structure from the computational content of gates. We apply our development to +define quantum circuits recursively from the bottom up, and prove their +correctness compositionally. + +
+
+
+
+
+ + ♻ ☆ Learning Task Decomposition to Assist Humans in Competitive Programming + + +
+ When using language models (LMs) to solve complex problems, humans might +struggle to understand the LM-generated solutions and repair the flawed ones. +To assist humans in repairing them, we propose to automatically decompose +complex solutions into multiple simpler pieces that correspond to specific +subtasks. We introduce a novel objective for learning task decomposition, +termed assistive value (AssistV), which measures the feasibility and speed for +humans to repair the decomposed solution. We collect a dataset of human repair +experiences on different decomposed solutions. Utilizing the collected data as +in-context examples, we then learn to critique, refine, and rank decomposed +solutions to improve AssistV. We validate our method under competitive +programming problems: under 177 hours of human study, our method enables +non-experts to solve 33.3\% more problems, speeds them up by 3.3x, and empowers +them to match unassisted experts. + +
+
+ comment: ACL 2024 Main Conference +
+
+
+
+
+
+
+
+ + Performance Profiling 2 + +
+
+
+ + ♻ ☆ Active Admission Control in a P2P Distributed Environment for Capacity + Efficient Livestreaming in Mobile Wireless Networks SC + + +
+ In this study, the Active Control in an Intelligent and Distributed
+Environment (ACIDE) media distribution model solution and algorithms are
+proposed for livestreaming in capacity efficient mobile wireless networks. The
+elements of the ACIDE model are a base station and a cluster formed by a number
+of peers able to establish peer to peer communications. The cluster peers are
+selected from a group of users interested in livestreaming the same media. The
+ACIDE model solution minimizes the bandwidth allocated to a cluster of n peers
+such that uninterrupted media play is guaranteed for all peers. The livestream
+media is sent to the peers in packages and every media package is divided into
+n blocks. The blocks are distributed to the n peers of a cluster in two phases,
+such that the base station bandwidth is utilized during the first phase only.
+The allocated bandwidth, the amount of bandwidth the base station has to
+allocate to a cluster, is minimized and its lower bound is equal to the
+bandwidth required for multicasting. In this study, the ACIDE model is used to
+address the problem of finding the maximum number of peers n, chosen from a
+group of N users, that can be admitted to a cluster given the allocated
+bandwidth, the amount of bandwidth that a base station allocates to a cluster
+in advance, prior to admitting users. When users become peers of an ACIDE
+cluster, the network capacity, the total number of users who are able to access
+live media, increases, meaning that network resources are used more
+efficiently. The problem of finding the maximum number of peers n is addressed
+as an optimization problem, with the objective of having the entire given
+allocated bandwidth used by the peers admitted to the cluster. This problem is
+NP-complete, and a non-optimal solution is proposed for peer selection such
+that all admitted peers play media continuously.
+
+
+ comment: 8 pages, 6 figures, 3 tables; Accepted for publication in: + Proceedings of the 2023 International Conference on Computational Science and + Computational Intelligence (CSCI'23: December 13-15, 2023, Las Vegas, Nevada, + USA); Publisher: IEEE Computer Society (CPS) +
+
+
+
+
+ + ♻ ☆ Parsing Gigabytes of JSON per Second + + +
+ JavaScript Object Notation or JSON is a ubiquitous data exchange format on +the Web. Ingesting JSON documents can become a performance bottleneck due to +the sheer volume of data. We are thus motivated to make JSON parsing as fast as +possible. + Despite the maturity of the problem of JSON parsing, we show that substantial +speedups are possible. We present the first standard-compliant JSON parser to +process gigabytes of data per second on a single core, using commodity +processors. We can use a quarter or fewer instructions than a state-of-the-art +reference parser like RapidJSON. Unlike other validating parsers, our software +(simdjson) makes extensive use of Single Instruction, Multiple Data (SIMD) +instructions. To ensure reproducibility, simdjson is freely available as +open-source software under a liberal license. + +
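+ The flavour of simdjson's first stage: build a bitmask marking structural
+characters for a whole block of input, then walk only the set bits. Real
+simdjson computes such masks branchlessly with SIMD over 64-byte blocks; the
+plain-Python loop below shows only the data layout of the idea.
+
+def structural_mask(block: bytes) -> int:
+    mask = 0
+    for i, b in enumerate(block):
+        if b in b'{}[]:,"':
+            mask |= 1 << i          # bit i set => structural char at byte i
+    return mask
+
+doc = b'{"k": [1, 2]}'
+m = structural_mask(doc)
+print([i for i in range(len(doc)) if (m >> i) & 1])   # positions to visit
+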
+
+ comment: software: https://github.com/lemire/simdjson +
+
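+ As a rough scalar illustration of the structural-indexing stage (this is
+not simdjson's code: the real parser computes these bitmaps with a handful
+of SIMD instructions and additionally masks out characters inside quoted
+strings), one can build a per-64-byte bitmap of structural characters and
+then visit only its set bits:
+
+  # Python sketch of structural indexing (illustrative, not simdjson itself).
+  STRUCTURAL = set(b'{}[]:,')
+
+  def structural_bitmap(block: bytes) -> int:
+      bits = 0
+      for i, byte in enumerate(block):
+          if byte in STRUCTURAL:
+              bits |= 1 << i        # one bit per byte position in the block
+      return bits
+
+  def structural_positions(doc: bytes):
+      for base in range(0, len(doc), 64):
+          bits = structural_bitmap(doc[base:base + 64])
+          while bits:               # iterate set bits, lowest first
+              low = bits & -bits
+              yield base + low.bit_length() - 1
+              bits ^= low
+
+ Iterating set bits instead of individual bytes is what lets the second
+parsing stage skip uninteresting input wholesale.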
+
+
+
+
+
+
+ + Computational Complexity 9 + +
+
+
+ + ☆ Regenerative Ulam-von Neumann Algorithm: An Innovative Markov chain + Monte Carlo Method for Matrix Inversion + + +
+ This paper presents an extension of the classical Ulam-von Neumann Markov
+chain Monte Carlo algorithm for the computation of the matrix inverse. The
+algorithm presented in this paper, termed the \emph{regenerative Ulam-von
+Neumann algorithm}, utilizes the regenerative structure of classical,
+non-truncated Neumann series defined by a non-singular matrix and produces an
+unbiased estimator of the matrix inverse. Furthermore, the accuracy of the
+proposed algorithm depends on a single parameter that controls the total
+number of Markov transitions simulated, thus avoiding the challenge of
+balancing between the total number of Markov chain replications and its
+corresponding length as in the classical Ulam-von Neumann algorithm. To
+efficiently utilize the Markov chain transition samples in the calculation of
+the regenerative quantities, the proposed algorithm automatically quantifies
+the contribution of each Markov transition to all regenerative quantities by
+a carefully designed updating scheme that utilizes three separate matrices
+containing the current weights, total weights, and regenerative cycle count,
+respectively. A probabilistic analysis of the performance of the algorithm,
+including the variance of the estimator, is provided. Finally, numerical
+experiments verify the qualitative effectiveness of the proposed scheme.
+
+
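+ For orientation, a minimal sketch of the classical, non-regenerative scheme
+that the paper extends: entries of $(I - A)^{-1} = \sum_k A^k$ are estimated
+by weighted random walks, assuming the Neumann series converges. The walk
+length is governed by the stopping probability, which is exactly the
+balancing knob the regenerative variant removes.
+
+  # Classical Ulam-von Neumann sketch (not the paper's regenerative version).
+  import random
+
+  def uvn_row(A, i, walks=20000, p_stop=0.3, rng=random.Random(0)):
+      """Estimate row i of (I - A)^{-1} for a list-of-lists matrix A."""
+      n = len(A)
+      est = [0.0] * n
+      for _ in range(walks):
+          state, weight = i, 1.0
+          est[state] += weight          # the k = 0 term of the series
+          while rng.random() > p_stop:  # continue with probability 1 - p_stop
+              nxt = rng.randrange(n)    # uniform proposal, probability 1/n
+              weight *= A[state][nxt] / ((1 - p_stop) / n)
+              state = nxt
+              est[state] += weight      # adds an unbiased A^k contribution
+      return [e / walks for e in est]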
+
+
+
+ + ☆ Inference of rankings planted in random tournaments + + +
+ We consider the problem of inferring an unknown ranking of $n$ items from a +random tournament on $n$ vertices whose edge directions are correlated with the +ranking. We establish, in terms of the strength of these correlations, the +computational and statistical thresholds for detection (deciding whether an +observed tournament is purely random or drawn correlated with a hidden ranking) +and recovery (estimating the hidden ranking with small error in Spearman's +footrule or Kendall's tau metric on permutations). Notably, we find that this +problem provides a new instance of a detection-recovery gap: solving the +detection problem requires much weaker correlations than solving the recovery +problem. In establishing these thresholds, we also identify simple algorithms +for detection (thresholding a degree 2 polynomial) and recovery (outputting a +ranking by the number of "wins" of a tournament vertex, i.e., the out-degree) +that achieve optimal performance up to constants in the correlation strength. +For detection, we find that the above low-degree polynomial algorithm is +superior to a natural spectral algorithm. We also find that, whenever it is +possible to achieve strong recovery (i.e., to estimate with vanishing error in +the above metrics) of the hidden ranking, then the above "Ranking By Wins" +algorithm not only does so, but also outputs a close approximation of the +maximum likelihood estimator, a task that is NP-hard in the worst case. + +
+
+ comment: 34 pages +
+
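+ The "Ranking By Wins" estimator mentioned above is short enough to state
+directly (a sketch following the abstract's description: rank vertices by
+out-degree, ties broken arbitrarily):
+
+  def ranking_by_wins(n, beats):
+      """beats[i][j] is True iff edge i -> j (i beats j) in the tournament."""
+      wins = [sum(1 for j in range(n) if j != i and beats[i][j])
+              for i in range(n)]
+      return sorted(range(n), key=lambda i: -wins[i])  # most wins first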
+
+
+
+ + ♻ ☆ The Computational Advantage of MIP* Vanishes in the Presence of Noise + + +
+ Quantum multiprover interactive proof systems with entanglement MIP* are
+much more powerful than their classical counterpart MIP (Babai et al. '91, Ji
+et al. '20): while MIP = NEXP, the quantum class MIP* is equal to RE, a class
+including the halting problem. This is because the provers in MIP* can share
+unbounded quantum entanglement. However, recent works of Qin and Yao '21 and
+'23 have shown that this advantage is significantly reduced if the provers'
+shared state contains noise. This paper attempts to exactly characterize the
+effect of noise on the computational power of quantum multiprover interactive
+proof systems. We investigate the quantum two-prover one-round interactive
+system MIP*[poly, O(1)], where the verifier sends polynomially many bits to
+the provers and the provers send back constantly many bits. We show noise
+completely destroys the computational advantage given by shared entanglement
+in this model. Specifically, we show that if the provers are allowed to share
+arbitrarily many noisy EPR states, where each EPR state is affected by an
+arbitrarily small constant amount of noise, the resulting complexity class is
+equivalent to NEXP = MIP. This improves significantly on the previous
+best-known bound of NEEEXP (nondeterministic triply exponential time) by Qin
+and Yao '21. We also show that this collapse in power is due to the noise,
+rather than the O(1) answer size, by showing that allowing for noiseless EPR
+states gives the class the full power of RE = MIP*[poly, poly]. Along the
+way, we develop two technical tools of independent interest. First, we give a
+new, deterministic tester for the positivity of an exponentially large
+matrix, provided it has a low-degree Fourier decomposition in terms of Pauli
+matrices. Secondly, we develop a new invariance principle for smooth matrix
+functions having bounded third-order Fr\'echet derivatives or which are
+Lipschitz continuous.
+
+
+ comment: V2, updated results. Comments are welcome! +
+
+
+
+
+ + ♻ ☆ Integer Programming Using A Single Atom + + +
+ Integer programming (IP), as the name suggests, is an
+integer-variable-based approach commonly used to formulate real-world
+optimization problems with constraints. Currently, quantum algorithms
+reformulate the IP into an unconstrained form through the use of binary
+variables, which is an indirect and resource-consuming way of solving it. We
+develop an algorithm that maps and solves an IP problem in its original form
+to any quantum system possessing a large number of accessible internal
+degrees of freedom that are controlled with sufficient accuracy. This work
+leverages the principle of superposition to solve the optimization problem.
+Using a single Rydberg atom as an example, we associate the integer values to
+electronic states belonging to different manifolds and implement a selective
+superposition of different states to solve the full IP problem. The optimal
+solution is found within a few microseconds for prototypical IP problems with
+up to eight variables and four constraints. This also includes non-linear IP
+problems, which are usually harder to solve with classical algorithms when
+compared to their linear counterparts. Our algorithm for solving IP is
+benchmarked against a well-known classical algorithm (branch and bound) in
+terms of the number of steps needed for convergence to the solution. This
+approach carries the potential to improve the solutions obtained for
+larger-size problems using hybrid quantum-classical algorithms.
+
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Delta-modular ILP Problems of Bounded Co-dimension, Discrepancy, and + Convolution + + +
+ For $k, n \geq 0$, and $c \in Z^n$, we consider ILP problems
+\begin{gather*}
+ \max\bigl\{ c^\top x \colon A x = b,\, x \in Z^n_{\geq 0} \bigr\}\text{ with
+$A \in Z^{k \times n}$, $rank(A) = k$, $b \in Z^{k}$ and}
+ \max\bigl\{ c^\top x \colon A x \leq b,\, x \in Z^n \bigr\} \text{ with $A
+\in Z^{(n+k) \times n}$, $rank(A) = n$, $b \in Z^{n+k}$.}
+\end{gather*}
+The first problem is called an \emph{ILP problem in the standard form of the
+codimension $k$}, and the second problem is called an \emph{ILP problem in
+the canonical form with $n+k$ constraints.} We show that, for any
+sufficiently large $\Delta$, both problems can be solved with
+$$ 2^{O(k)} \cdot (f_{k,d} \cdot \Delta)^2 /
+2^{\Omega\bigl(\sqrt{\log(f_{k,d} \cdot \Delta)}\bigr)} $$
+operations, where $ f_{k,d} = \min \Bigl\{ k^{k/2},
+\bigl(\log k \cdot \log (d + k)\bigr)^{k/2} \Bigr\} $, $d$ is the dimension
+of a corresponding polyhedron and $\Delta$ is the maximum absolute value of
+$rank(A) \times rank(A)$ sub-determinants of $A$.
+ As our second main result, we show that the feasibility variants of both
+problems can be solved with
+$$ 2^{O(k)} \cdot f_{k,d} \cdot \Delta \cdot \log^3(f_{k,d} \cdot \Delta) $$
+operations. The constant $f_{k,d}$ can be replaced by another constant
+$g_{k,\Delta} = \bigl(\log k \cdot \log (k \Delta)\bigr)^{k/2}$ that depends
+only on $k$ and $\Delta$. Additionally, we consider the special cases $k=0$
+and $k=1$, which have interesting applications.
+ As a result of independent interest, we propose an
+$n^2/2^{\Omega\bigl(\sqrt{\log n}\bigr)}$-time algorithm for the tropical
+convolution problem on sequences indexed by elements of a finite Abelian
+group of order $n$. Additionally, we give a complete, self-contained error
+analysis of the generalized Discrete Fourier Transform for Abelian groups
+with respect to the Word-RAM computational model.
+
+
+
+
+
+ + ♻ ☆ Elementary Quantum Recursion Schemes That Capture Quantum + Polylogarithmic Time Computability of Quantum Functions + + +
+ Quantum computing has been studied over the past four decades based on two
+computational models of quantum circuits and quantum Turing machines. To
+capture quantum polynomial-time computability, a new recursion-theoretic
+approach was recently taken by Yamakami [J. Symb. Logic 80, pp.~1546--1587,
+2020] by way of recursion schematic definition, which consists of six initial
+quantum functions and three construction schemes of composition, branching,
+and multi-qubit quantum recursion. By taking a similar approach, we look into
+quantum polylogarithmic-time computability and further explore the expressive
+power of elementary schemes designed for such quantum computation. In
+particular, we introduce an elementary form of the quantum recursion, called
+the fast quantum recursion, and formulate $EQS$ (elementary quantum schemes)
+of ``elementary'' quantum functions. This class $EQS$ captures exactly
+quantum polylogarithmic-time computability, which forms the complexity class
+BQPOLYLOGTIME. We also demonstrate the separation of BQPOLYLOGTIME from
+NLOGTIME and PPOLYLOGTIME. As a natural extension of $EQS$, we further
+consider an algorithmic procedural scheme that implements the well-known
+divide-and-conquer strategy. This divide-and-conquer scheme helps compute the
+parity function, but the scheme cannot be realized within our system $EQS$.
+
+
+ comment: (A4, 10pt, 29 pages) This is a corrected and expanded version of the + preliminary report that has appeared, under a different title, in the + Proceedings of the 28th International Conference on Logic, Language, + Information, and Computation (WoLLIC 2022), Ia\c{s}i, Romania, September + 20--23, 2022, Lecture Notes in Computer Science, vol. 13468, pp. 88-104, + Springer, 2022 +
+
+
+
+
+ + ♻ ☆ Arborescences and Shortest Path Trees when Colors Matter + + +
+ Color-constrained subgraph problems are those where we are given an
+edge-colored (directed or undirected) graph and the task is to find a
+specific type of subgraph, like a spanning tree, an arborescence, a
+single-source shortest path tree, a perfect matching etc., with constraints
+on the number of edges of each color. Some of these problems, like
+color-constrained spanning tree, have elegant solutions and some of them,
+like color-constrained perfect matching, are longstanding open questions. In
+this work, we study color-constrained arborescences and shortest path trees.
+Computing a color-constrained shortest path tree on weighted digraphs turns
+out to be NP-hard in general but polynomial-time solvable when all cycles
+have positive weight. This polynomial-time solvability is due to the fact
+that the solution space is essentially the set of all color-constrained
+arborescences of a directed acyclic subgraph of the original graph. While
+finding a color-constrained arborescence of a digraph is NP-hard in general,
+we give efficient algorithms when the input graph is acyclic. Consequently, a
+color-constrained shortest path tree on weighted digraphs having only
+positive weight cycles can be efficiently computed. Our algorithms also
+generalize to the problem of finding a color-constrained shortest path tree
+with minimum total weight. En route, we observe nice connections to colored
+matroids and color-constrained bases.
+
+
+ comment: Major revision, solving a more generalized problem +
+
+
+
+
+ + ♻ ☆ Average-case deterministic query complexity of boolean functions with + fixed weight + + +
+ We explore the $\textit{average-case deterministic query complexity}$ of
+boolean functions under a $\textit{uniform distribution}$, denoted by
+$\mathrm{D}_\mathrm{ave}(f)$: the minimum average depth of a zero-error
+decision tree computing a boolean function $f$. This measure has found
+several applications across diverse fields, yet its understanding is limited.
+We study $\mathrm{D}_\mathrm{ave}(f)$ of several functions, including the
+penalty shoot-out function, symmetric functions, linear threshold functions
+and the tribes functions. We prove $\mathrm{D}_\mathrm{ave}(f) \le \max \{
+\log \frac{\mathrm{wt}(f)}{\log n} + O(\log \log
+\frac{\mathrm{wt}(f)}{\log n}), O(1) \}$ for every $n$-variable boolean
+function $f$, where $\mathrm{wt}(f)$ denotes the weight (the number of inputs
+on which $f$ outputs $1$). For any $4\log n \le m(n) \le 2^{n-1}$, we prove
+the upper bound is tight up to an additive logarithmic term for almost all
+$n$-variable boolean functions with weight $\mathrm{wt}(f) = m(n)$. Using
+H\r{a}stad's switching lemma or Rossman's switching lemma [Comput. Complexity
+Conf. 137, 2019], one can derive $\mathrm{D}_\mathrm{ave}(f) \leq n(1 -
+\frac{1}{O(w)})$ or $\mathrm{D}_\mathrm{ave}(f) \le n(1 -
+\frac{1}{O(\log s)})$ for CNF/DNF formulas of width $w$ or size $s$,
+respectively. We show that, for any $w \ge \log n + \log \log n + 3$, there
+exists a DNF formula of width $w$ and size $\lceil 2^w / w \rceil$ such that
+$\mathrm{D}_\mathrm{ave}(f) = n (1 - \frac{\log n}{\Theta(w)})$. In other
+words, we show the criticality upper bounds $O(w)$ and $O(\log s)$ are tight
+up to a multiplicative $\log n$ factor, providing evidence for the tightness
+of the switching lemmas.
+
+
+
+
+
+ + ♻ ☆ Equality cases of the Alexandrov--Fenchel inequality are not in the + polynomial hierarchy + + +
+ Describing the equality conditions of the Alexandrov--Fenchel inequality
+has been a major open problem for decades. We prove that in the case of
+convex polytopes, this description is not in the polynomial hierarchy unless
+the polynomial hierarchy collapses to a finite level. This is the first
+hardness result for the problem, and is a complexity counterpart of the
+recent result by Shenfeld and van Handel (arXiv:2011.04059), which gave a
+geometric characterization of the equality conditions. The proof involves
+Stanley's order polytopes and employs poset theoretic technology.
+
+
+ comment: 35 pages. Fixed some typos and updated some references. to appear in + Forum Math. Pi +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ♻ ☆ Linear Matching of JavaScript Regular Expressions + + +
+ Modern regex languages have strayed far from well-understood traditional +regular expressions: they include features that fundamentally transform the +matching problem. In exchange for these features, modern regex engines at times +suffer from exponential complexity blowups, a frequent source of +denial-of-service vulnerabilities in JavaScript applications. Worse, regex +semantics differ across languages, and the impact of these divergences on +algorithmic design and worst-case matching complexity has seldom been +investigated. + This paper provides a novel perspective on JavaScript's regex semantics by +identifying a larger-than-previously-understood subset of the language that can +be matched with linear time guarantees. In the process, we discover several +cases where state-of-the-art algorithms were either wrong (semantically +incorrect), inefficient (suffering from superlinear complexity) or excessively +restrictive (assuming certain features could not be matched linearly). We +introduce novel algorithms to restore correctness and linear complexity. We +further advance the state-of-the-art in linear regex matching by presenting the +first nonbacktracking algorithms for matching lookarounds in linear time: one +supporting captureless lookbehinds in any regex language, and another +leveraging a JavaScript property to support unrestricted lookaheads and +lookbehinds. Finally, we describe new time and space complexity tradeoffs for +regex engines. All of our algorithms are practical: we validated them in a +prototype implementation, and some have also been merged in the V8 JavaScript +implementation used in Chrome and Node.js. + +
+
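+ One classical way to see nonbacktracking matching, shown here for a toy
+regex subset via Brzozowski derivatives (purely illustrative: the paper's
+algorithms cover full JavaScript semantics, lookarounds and captures, and a
+practical engine would memoise and simplify the derivative terms to keep the
+work per character bounded):
+
+  # AST: ('nul',), ('eps',), ('chr', c), ('cat', r, s), ('alt', r, s), ('star', r)
+  def nullable(r):
+      tag = r[0]
+      if tag in ('eps', 'star'): return True
+      if tag in ('nul', 'chr'):  return False
+      if tag == 'cat': return nullable(r[1]) and nullable(r[2])
+      return nullable(r[1]) or nullable(r[2])            # 'alt'
+
+  def deriv(r, c):
+      tag = r[0]
+      if tag in ('nul', 'eps'): return ('nul',)
+      if tag == 'chr':  return ('eps',) if r[1] == c else ('nul',)
+      if tag == 'star': return ('cat', deriv(r[1], c), r)
+      if tag == 'alt':  return ('alt', deriv(r[1], c), deriv(r[2], c))
+      head = ('cat', deriv(r[1], c), r[2])               # 'cat'
+      return ('alt', head, deriv(r[2], c)) if nullable(r[1]) else head
+
+  def matches(r, s):
+      for c in s:
+          r = deriv(r, c)      # one derivative step per input character
+      return nullable(r)
+
+  # matches(('cat', ('star', ('chr', 'a')), ('chr', 'b')), 'aaab')  ->  True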
+
+
+
+
+
+
+ + Logic in Computer Science 8 + +
+
+
+ + ☆ Gödel logics: Prenex fragments + + +
+ In this paper, we provide a complete classification for the first-order
+G\"odel logics concerning the property that the formulas admit logically
+equivalent prenex normal forms. We show that the only first-order G\"odel
+logics that admit such prenex forms are those with finite truth value sets,
+since they allow all quantifier-shift rules, and the logic \(G_\uparrow\)
+with only one accumulation point at $1$ in the infinite truth value set. In
+all the other cases, there are generally no logically equivalent prenex
+normal forms. We will also see that \(G_\uparrow\) is the intersection of all
+finite first-order G\"odel logics. The second part of this paper investigates
+the existence of effective equivalence between the validity of a formula and
+the validity of some prenex normal form. The existence of such a normal form
+is obvious for finite-valued G\"odel logics and \(G_\uparrow\). G\"odel
+logics with an uncountable truth value set admit the prenex normal forms if
+and only if every neighborhood of \(0\) is uncountable or \(0\) is an
+isolated point. Otherwise, uncountable G\"odel logics are not recursively
+enumerable; however, the prenex fragment is always recursively enumerable.
+Therefore, there is no effective translation between the valid formulas and
+the valid prenex normal forms. However, the existence of effectively
+constructible validity-equivalent prenex forms for the countable case remains
+open.
+
+
+ comment: Research supported by FWF grant P 36571 +
+
+
+
+
+ + ☆ Efficient Discovery of Actual Causality using Abstraction-Refinement + + +
+ Causality is an influence by which one event contributes to the production
+of another event, where the cause is partly responsible for the effect, and
+the effect is partly dependent on the cause. In this paper, we propose a
+novel and effective method to formally reason about the causal effect of
+events in engineered systems, with application to finding the root cause of
+safety violations in embedded and cyber-physical systems. We are motivated by
+the notion of actual causality by Halpern and Pearl, which focuses on the
+causal effect of particular events, rather than type-level causality, which
+attempts to make general statements about scientific and natural phenomena.
+Our first contribution is formulating the discovery of actual causality in
+computing systems modeled by transition systems as an SMT solving problem.
+Since datasets for causality analysis tend to be large, in order to tackle
+the scalability problem of automated formal reasoning, our second
+contribution is a novel technique based on abstraction-refinement that allows
+identifying actual causes within smaller abstract causal models. We
+demonstrate the effectiveness of our approach (by several orders of
+magnitude) using three case studies to find the actual cause of violations of
+safety in (1) a neural network controller for a mountain car, (2) a
+controller for a lunar lander obtained by reinforcement learning, and (3) an
+MPC controller for an F-16 autopilot simulator.
+
+
+
+
+
+ + ☆ A Logic for Veracity: Development and Implementation + + +
+ In the business rules of supply chains, there are concerns around trust,
+truth, demonstrability and authenticity. These concerns are gathered together
+under the name ``veracity''.
+ In the work for this paper we were originally motivated by the requirement
+around organic certification in the wine industry in New Zealand, but
+veracity arises in many different situations and our formalisation shows how
+formal methods can give insights into many such practical problems.
+ One activity for formal methods involves taking informal processes and
+formalising them and subsequently building tools to support this
+formalisation and therefore the original processes too, and the work reported
+here is an example of that.
+ Here, then, we explore the idea of veracity in this spirit, give highlights
+of the development of a logic for it and show how that logic can be
+implemented in Coq, both for proof support and automation.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2302.06164 +
+
+
+
+
+ + ☆ Path-optimal symbolic execution of heap-manipulating programs + + +
+ Symbolic execution is at the core of many techniques for program analysis
+and test generation. Traditional symbolic execution of programs with numeric
+inputs enjoys the property of forking as many analysis traces as the number
+of analyzed program paths, a property that in this paper we refer to as path
+optimality. On the contrary, current approaches for symbolic execution of
+heap-manipulating programs fail to satisfy this property, thereby incurring
+heavy path explosion effects that crucially penalize the efficiency of the
+analysis. This paper introduces POSE, path-optimal symbolic execution, a
+symbolic execution algorithm that achieves path optimality for
+heap-manipulating programs. We formalize the POSE algorithm for a tiny, but
+representative object-oriented programming language, and implement the
+formalization into a prototype symbolic executor to evaluate the algorithm
+against a benchmark of sample programs that take data structures as inputs.
+Our experiments provide initial empirical evidence of the potential of POSE
+for improving on the state of the art of symbolic execution of
+heap-manipulating programs.
+
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ An Algorithm for Reversible Logic Circuit Synthesis Based on Tensor + Decomposition + + +
+ An algorithm for reversible logic synthesis is proposed. The task is, for a
+given $n$-bit substitution map $P_n: \{0,1\}^n \rightarrow \{0,1\}^n$, to
+find a sequence of reversible logic gates that implements the map. The gate
+library adopted in this work consists of multiple-controlled Toffoli gates
+denoted by $C^m\!X$, where $m$ is the number of control bits that ranges from
+0 to $n-1$. Controlled gates with large $m \,\,(>2)$ are then further
+decomposed into $C^0\!X$, $C^1\!X$, and $C^2\!X$ gates. A primary concern in
+designing the algorithm is to reduce the use of the $C^2\!X$ gate (also known
+as the Toffoli gate), which is known to be universal.
+ The main idea is to view an $n$-bit substitution map as a rank-$2n$ tensor
+and to transform it such that the resulting map can be written as a tensor
+product of a rank-($2n-2$) tensor and the $2\times 2$ identity matrix. Let
+$\mathcal{P}_n$ be a set of all $n$-bit substitution maps. What we try to
+find is a size reduction map $\mathcal{A}_{\rm red}: \mathcal{P}_n
+\rightarrow \{P_n: P_n = P_{n-1} \otimes I_2\}$, where $I_m$ is the $m\times
+m$ identity matrix. One can see that the output $P_{n-1} \otimes I_2$ acts
+nontrivially on $n-1$ bits only, meaning that the map to be synthesized
+becomes $P_{n-1}$. The size reduction process is iteratively applied until it
+reaches a tensor product of only $2 \times 2$ matrices.
+
+
+
+
+
+ + ♻ ☆ Behavioural Metrics: Compositionality of the Kantorovich Lifting and an + Application to Up-To Techniques + + +
+ Behavioural distances of transition systems modelled via coalgebras for
+endofunctors generalize traditional notions of behavioural equivalence to a
+quantitative setting, in which states are equipped with a measure of how
+(dis)similar they are. Endowing transition systems with such distances
+essentially relies on the ability to lift functors describing the one-step
+behavior of the transition systems to the category of pseudometric spaces. We
+consider the category theoretic generalization of the Kantorovich lifting
+from transportation theory to the case of lifting functors to quantale-valued
+relations, which subsumes equivalences, preorders and (directed) metrics. We
+use tools from fibred category theory, which allow one to see the Kantorovich
+lifting as arising from an appropriate fibred adjunction. Our main
+contributions are compositionality results for the Kantorovich lifting, where
+we show that the lifting of a composed functor coincides with the composition
+of the liftings. In addition, we describe how to lift distributive laws in
+the case where one of the two functors is polynomial (with finite
+coproducts). These results are essential ingredients for adapting
+up-to-techniques to the case of quantale-valued behavioural distances. Up-to
+techniques are a well-known coinductive technique for efficiently showing
+lower bounds for behavioural distances. We illustrate the results of our
+paper in two case studies.
+
+
+
+
+
+ + ♻ ☆ Typed compositional quantum computation with lenses + + +
+ We propose a type-theoretic framework for describing and proving properties +of quantum computations, in particular those presented as quantum circuits. Our +proposal is based on an observation that, in the polymorphic type system of +Coq, currying on quantum states allows us to apply quantum gates directly +inside a complex circuit. By introducing a discrete notion of lens to control +this currying, we are further able to separate the combinatorics of the circuit +structure from the computational content of gates. We apply our development to +define quantum circuits recursively from the bottom up, and prove their +correctness compositionally. + +
+
+
+
+
+ + ♻ ☆ Sound and Complete Proof Rules for Probabilistic Termination + + +
+ Deciding termination is a fundamental problem in the analysis of
+probabilistic imperative programs. We consider the qualitative and
+quantitative probabilistic termination problems for an imperative programming
+model with discrete probabilistic choice and demonic bounded nondeterminism.
+The qualitative question asks if the program terminates almost-surely, no
+matter how nondeterminism is resolved. The quantitative question asks for a
+bound on the probability of termination. Despite a long and rich literature
+on the topic, no sound and relatively complete proof systems were known for
+these problems. In this paper, we provide such sound and relatively complete
+proof rules for proving qualitative and quantitative termination in the
+assertion language of arithmetic. Our rules use supermartingales as estimates
+of the likelihood of a program's evolution and variants as measures of
+distances to termination. Our key insight is our completeness result, which
+shows how to construct a suitable supermartingale from an almost-surely
+terminating program. We also show that proofs of termination in many existing
+proof systems can be transformed to proofs in our system, pointing to its
+applicability in practice. As an application of our proof rule, we show an
+explicit proof of almost-sure termination for the two-dimensional random
+walker.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ 
+ Hardware Architecture 9
+ 

+
+
+ + ☆ The Bicameral Cache: a split cache for vector architectures + + +
+ The Bicameral Cache is a cache organization proposal for a vector
+architecture that segregates data according to their access type,
+distinguishing scalar from vector references. Its aim is to prevent both
+types of references from interfering with each other's data locality, with a
+special focus on prioritizing the performance on vector references. The
+proposed system incorporates an additional, non-polluting prefetching
+mechanism to help populate the long vector cache lines in advance to increase
+the hit rate by further exploiting the spatial locality on vector data. Its
+evaluation was conducted on the Cavatools simulator, comparing the
+performance to a standard conventional cache, over different typical vector
+benchmarks for several vector lengths. The results showed that the proposed
+cache speeds up performance on stride-1 vector benchmarks, while hardly
+impacting non-stride-1's. In addition, the prefetching feature consistently
+provided an additional value.
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Customized Retrieval Augmented Generation and Benchmarking for EDA Tool + Documentation QA + + +
+ Retrieval augmented generation (RAG) enhances the accuracy and reliability
+of generative AI models by sourcing factual information from external
+databases, which is extensively employed in document-grounded
+question-answering (QA) tasks. Off-the-shelf RAG flows are well pretrained on
+general-purpose documents, yet they encounter significant challenges when
+being applied to knowledge-intensive vertical domains, such as electronic
+design automation (EDA). This paper addresses this issue by proposing a
+customized RAG framework along with three domain-specific techniques for EDA
+tool documentation QA, including a contrastive learning scheme for text
+embedding model fine-tuning, a reranker distilled from a proprietary LLM, and
+a generative LLM fine-tuned with high-quality domain corpus. Furthermore, we
+have developed and released a documentation QA evaluation benchmark, ORD-QA,
+for OpenROAD, an advanced RTL-to-GDSII design platform. Experimental results
+demonstrate that our proposed RAG flow and techniques have achieved superior
+performance on ORD-QA as well as on a commercial tool, compared with the
+state of the art. The ORD-QA benchmark and the training dataset for our
+customized RAG flow are open-source at
+https://github.com/lesliepy99/RAG-EDA.
+
+
+
+
+
+ + ☆ MINT: Securely Mitigating Rowhammer with a Minimalist In-DRAM Tracker + + +
+ This paper investigates secure low-cost in-DRAM trackers for mitigating +Rowhammer (RH). In-DRAM solutions have the advantage that they can solve the RH +problem within the DRAM chip, without relying on other parts of the system. +However, in-DRAM mitigation suffers from two key challenges: First, the +mitigations are synchronized with refresh, which means we cannot mitigate at +arbitrary times. Second, the SRAM area available for aggressor tracking is +severely limited, to only a few bytes. Existing low-cost in-DRAM trackers (such +as TRR) have been broken by well-crafted access patterns, whereas prior +counter-based schemes require impractical overheads of hundreds or thousands of +entries per bank. The goal of our paper is to develop an ultra low-cost secure +in-DRAM tracker. + Our solution is based on a simple observation: if only one row can be +mitigated at refresh, then we should ideally need to track only one row. We +propose a Minimalist In-DRAM Tracker (MINT), which provides secure mitigation +with just a single entry. At each refresh, MINT probabilistically decides which +activation in the upcoming interval will be selected for mitigation at the next +refresh. MINT provides guaranteed protection against classic single and +double-sided attacks. We also derive the minimum RH threshold (MinTRH) +tolerated by MINT across all patterns. MINT has a MinTRH of 1482 which can be +lowered to 356 with RFM. The MinTRH of MINT is lower than a prior counter-based +design with 677 entries per bank, and is within 2x of the MinTRH of an +idealized design that stores one-counter-per-row. We also analyze the impact of +refresh postponement on the MinTRH of low-cost in-DRAM trackers, and propose an +efficient solution to make such trackers compatible with refresh postponement. + +
+
+ comment: 13 pages including appendix +
+
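+ One plausible reading of the single-entry policy (a sketch from the
+abstract, not the paper's exact mechanism): reservoir-sample one activated
+row per refresh interval, so every activation in the interval is equally
+likely to be the one mitigated at the next refresh.
+
+  import random
+
+  class SingleEntryTracker:
+      def __init__(self, rng=random.Random(0)):
+          self.rng, self.tracked, self.count = rng, None, 0
+
+      def on_activation(self, row):
+          self.count += 1
+          if self.rng.randrange(self.count) == 0:  # replace with prob. 1/count
+              self.tracked = row                   # uniform over the interval
+
+      def on_refresh(self):
+          victim, self.tracked, self.count = self.tracked, None, 0
+          return victim    # neighbours of this row receive the mitigation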
+
+
+
+ + ☆ KWT-Tiny: RISC-V Accelerated, Embedded Keyword Spotting Transformer + + +
+ This paper explores the adaptation of Transformer-based models for edge
+devices through the quantisation and hardware acceleration of the ARM Keyword
+Transformer (KWT) model on a RISC-V platform. The model was targeted to run
+on 64kB RAM in bare-metal C using a custom-developed edge AI library. KWT-1
+was retrained to be 369 times smaller, with only a 10% loss in accuracy,
+through reducing output classes from 35 to 2. The retraining and quantisation
+reduced model size from 2.42 MB to 1.65 kB. The integration of custom RISC-V
+instructions that accelerated GELU and SoftMax operations enabled a 5x
+speedup and thus ~5x power reduction in inference, with inference clock cycle
+counts decreasing from 26 million to 5.5 million clock cycles while incurring
+a small area overhead of approximately 29%. The results demonstrate a viable
+method for porting and accelerating Transformer-based models in low-power IoT
+devices.
+
+
+ comment: 6 pages, 7 figures, accepted to be published in the IEEE SOCC 2024 + conference +
+
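+ The generic post-training step underlying such size reductions can be
+sketched as symmetric int8 quantisation (the paper's actual pipeline also
+involves retraining and custom RISC-V kernels, which this does not show):
+
+  import numpy as np
+
+  def quantize_int8(w):
+      """Map a float weight tensor to int8 plus a scale factor."""
+      scale = float(np.max(np.abs(w))) / 127.0
+      if scale == 0.0:
+          scale = 1.0                       # all-zero tensor edge case
+      q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
+      return q, scale                       # dequantize with q * scale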
+
+
+
+ + ☆ ImPress: Securing DRAM Against Data-Disturbance Errors via Implicit + Row-Press Mitigation + + +
+ DRAM cells are susceptible to Data-Disturbance Errors (DDE), which can be +exploited by an attacker to compromise system security. Rowhammer is a +well-known DDE vulnerability that occurs when a row is repeatedly activated. +Rowhammer can be mitigated by tracking aggressor rows inside DRAM (in-DRAM) or +at the Memory Controller (MC). Row-Press (RP) is a new DDE vulnerability that +occurs when a row is kept open for a long time. RP significantly reduces the +number of activations required to induce an error, thus breaking existing RH +solutions. Prior work on Explicit Row-Press mitigation, ExPress, requires the +memory controller to limit the maximum row-open-time, and redesign existing +Rowhammer solutions with reduced Rowhammer threshold. Unfortunately, ExPress +incurs significant performance and storage overheads, and being a memory +controller-based solution, it is incompatible with in-DRAM trackers. In this +paper, we propose Implicit Row-Press mitigation (ImPress), which does not +restrict row-open-time, is compatible with memory controller-based and in-DRAM +solutions and does not reduce the tolerated Rowhammer threshold. ImPress treats +a row open for a specified time as equivalent to an activation. We design +ImPress by developing a Unified Charge-Loss Model, which combines the net +effect of both Rowhammer and Row-Press for arbitrary patterns. We analyze both +controller-based (Graphene and PARA) and in-DRAM trackers (Mithril and MINT). +We show that ImPress makes Rowhammer solutions resilient to Row-Press +transparently, without affecting the Rowhammer threshold. + +
+
+ comment: 12 page paper +
+
+
+
+
+ + ☆ AICircuit: A Multi-Level Dataset and Benchmark for AI-Driven Analog + Integrated Circuit Design + + +
+ Analog and radio-frequency circuit design requires extensive exploration of
+both circuit topology and parameters to meet specific design criteria like
+power consumption and bandwidth. Designers must review state-of-the-art
+topology configurations in the literature and sweep various circuit
+parameters within each configuration. This design process is highly
+specialized and time-intensive, particularly as the number of circuit
+parameters increases and the circuit becomes more complex. Prior research has
+explored the potential of machine learning to enhance circuit design
+procedures. However, these studies primarily focus on simple circuits,
+overlooking the more practical and complex analog and radio-frequency
+systems. A major obstacle to bringing the power of machine learning to
+circuit design is the availability of a generic and diverse dataset, along
+with robust metrics, which are essential for thoroughly evaluating and
+improving machine learning algorithms in the analog and radio-frequency
+circuit domain. We present AICircuit, a comprehensive multi-level dataset and
+benchmark for developing and evaluating ML algorithms in analog and
+radio-frequency circuit design. AICircuit comprises seven commonly used basic
+circuits and two complex wireless transceiver systems composed of multiple
+circuit blocks, encompassing a wide array of design scenarios encountered in
+real-world applications. We extensively evaluate various ML algorithms on the
+dataset, revealing the potential of ML algorithms in learning the mapping
+from the design specifications to the desired circuit parameters.
+
+
+
+
+
+ + ♻ ☆ CHOSEN: Compilation to Hardware Optimization Stack for Efficient Vision + Transformer Inference + + +
+ Vision Transformers (ViTs) represent a groundbreaking shift in machine
+learning approaches to computer vision. Unlike traditional approaches, ViTs
+employ the self-attention mechanism, which has been widely used in natural
+language processing, to analyze image patches. Despite their advantages in
+modeling visual tasks, deploying ViTs on hardware platforms, notably
+Field-Programmable Gate Arrays (FPGAs), introduces considerable challenges.
+These challenges stem primarily from the non-linear calculations and high
+computational and memory demands of ViTs. This paper introduces CHOSEN, a
+software-hardware co-design framework to address these challenges and offer
+an automated framework for ViT deployment on FPGAs in order to maximize
+performance. Our framework is built upon three fundamental contributions: a
+multi-kernel design to maximize the bandwidth, mainly targeting the benefits
+of multiple DDR memory banks; approximate non-linear functions that exhibit
+minimal accuracy degradation while making efficient use of the available
+logic blocks on the FPGA; and an efficient compiler that maximizes the
+performance and memory efficiency of the computing kernels by means of a
+novel design space exploration algorithm that finds a hardware configuration
+achieving optimal throughput and latency. Compared to the state-of-the-art
+ViT accelerators, CHOSEN achieves a 1.5x and 1.42x improvement in throughput
+on the DeiT-S and DeiT-B models.
+
+
+
+
+
+ + ♻ ☆ ARCO:Adaptive Multi-Agent Reinforcement Learning-Based Hardware/Software + Co-Optimization Compiler for Improved Performance in DNN Accelerator Design + + +
+ This paper presents ARCO, an adaptive Multi-Agent Reinforcement Learning +(MARL)-based co-optimizing compilation framework designed to enhance the +efficiency of mapping machine learning (ML) models - such as Deep Neural +Networks (DNNs) - onto diverse hardware platforms. The framework incorporates +three specialized actor-critic agents within MARL, each dedicated to a distinct +aspect of compilation/optimization at an abstract level: one agent focuses on +hardware, while two agents focus on software optimizations. This integration +results in a collaborative hardware/software co-optimization strategy that +improves the precision and speed of DNN deployments. Concentrating on +high-confidence configurations simplifies the search space and delivers +superior performance compared to current optimization methods. The ARCO +framework surpasses existing leading frameworks, achieving a throughput +increase of up to 37.95% while reducing the optimization time by up to 42.2% +across various DNNs. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ A Mess of Memory System Benchmarking, Simulation and Application + Profiling MICRO-57 + + +
+ The Memory stress (Mess) framework provides a unified view of memory system
+benchmarking, simulation and application profiling. The Mess benchmark
+provides a holistic and detailed memory system characterization. It is based
+on hundreds of measurements that are represented as a family of
+bandwidth--latency curves. The benchmark increases the coverage of all the
+previous tools and leads to new findings in the behavior of actual and
+simulated memory systems. We deploy the Mess benchmark to characterize Intel,
+AMD, IBM, Fujitsu, Amazon and NVIDIA servers with DDR4, DDR5, HBM2 and HBM2E
+memory. The Mess analytical memory simulator uses the bandwidth--latency
+concept for memory performance simulation. We integrate Mess with widely-used
+CPU simulators, enabling modeling of all high-end memory technologies. The
+Mess simulator is fast, easy to integrate and closely matches the actual
+system performance. By design, it enables quick adoption of new memory
+technologies in hardware simulators. Finally, the Mess application profiling
+positions the application in the bandwidth--latency space of the target
+memory system. This information can be correlated with other application
+runtime activities and the source code, leading to a better overall
+understanding of the application's behavior. The current Mess benchmark
+release covers all major CPU and GPU ISAs: x86, ARM, Power, RISC-V, and
+NVIDIA's PTX. We also release as open source the ZSim, gem5 and OpenPiton
+Metro-MPI simulators integrated with the Mess simulator for DDR4, DDR5,
+Optane, HBM2, HBM2E and CXL memory expanders. The Mess application profiling
+is already integrated into a suite of production HPC performance analysis
+tools.
+
+
+ comment: 17 pages; just accepted in MICRO-57 +
+
+
+
+
+
+
+
+ + Logic in Computer Science 5 + +
+
+
+ + ☆ Algebraic anti-unification + + +
+ Abstraction is key to human and artificial intelligence as it allows one to
+see common structure in otherwise distinct objects or situations, and as such
+it is a key element for generality in AI. Anti-unification (or
+generalization) is \textit{the} part of theoretical computer science and AI
+studying abstraction. It has been successfully applied to various AI-related
+problems, most importantly inductive logic programming. To date,
+anti-unification has been studied in the literature only from a syntactic
+perspective. The purpose of this paper is to initiate an algebraic (i.e.
+semantic) theory of anti-unification within general algebras. This is
+motivated by recent applications to similarity and analogical proportions.
+
+
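+ For contrast with the algebraic view proposed here, the standard syntactic
+anti-unification of two first-order terms computes a least general
+generalization by introducing a variable at each disagreement (a sketch, with
+terms encoded as nested tuples):
+
+  def lgg(s, t, table=None):
+      """Least general generalization; terms are ('f', arg, ...) or atoms."""
+      table = {} if table is None else table
+      if (isinstance(s, tuple) and isinstance(t, tuple)
+              and s[0] == t[0] and len(s) == len(t)):
+          return (s[0],) + tuple(lgg(a, b, table)
+                                 for a, b in zip(s[1:], t[1:]))
+      if s == t:
+          return s
+      if (s, t) not in table:            # the same clash reuses one variable
+          table[(s, t)] = 'X%d' % len(table)
+      return table[(s, t)]
+
+  # lgg(('f', 'a', ('g', 'b')), ('f', 'c', ('g', 'b')))
+  #   -> ('f', 'X0', ('g', 'b'))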
+
+
+
+ + ☆ Color Refinement for Relational Structures + + +
+ Color Refinement, also known as Naive Vertex Classification, is a classical
+method to distinguish graphs by iteratively computing a coloring of their
+vertices. While it is mainly used as an imperfect way to test for
+isomorphism, the algorithm permeated many other, seemingly unrelated, areas
+of computer science. The method is algorithmically simple, and it has a
+well-understood distinguishing power: It is logically characterized by Cai,
+F\"urer and Immerman (1992), who showed that it distinguishes precisely those
+graphs that can be distinguished by a sentence of first-order logic with
+counting quantifiers and only two variables. A combinatorial characterization
+is given by Dvo\v{r}\'ak (2010), who shows that it distinguishes precisely
+those graphs that can be distinguished by the number of homomorphisms from
+some tree.
+ In this paper, we introduce Relational Color Refinement (RCR, for short), a
+generalization of the Color Refinement method from graphs to arbitrary
+relational structures, whose distinguishing power admits combinatorial and
+logical characterizations equivalent to those Color Refinement has on graphs:
+We show that RCR distinguishes precisely those structures that can be
+distinguished by the number of homomorphisms from an acyclic relational
+structure. Further, we show that RCR distinguishes precisely those structures
+that can be distinguished by a sentence of the guarded fragment of
+first-order logic with counting quantifiers.
+
+
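+ The graph algorithm being generalized is compact enough to recall (a
+standard formulation, not the paper's relational extension): repeatedly
+re-color every vertex by its current color together with the multiset of its
+neighbours' colors, until the partition stabilizes.
+
+  def color_refinement(adj):
+      """adj: {v: iterable of neighbours}. Returns the stable coloring."""
+      color = {v: 0 for v in adj}
+      while True:
+          sig = {v: (color[v], tuple(sorted(color[u] for u in adj[v])))
+                 for v in adj}
+          palette = {s: i for i, s in enumerate(sorted(set(sig.values())))}
+          new = {v: palette[sig[v]] for v in adj}
+          if new == color:
+              return color               # partition no longer refines
+          color = new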
+
+
+
+ + ♻ ☆ Around Classical and Intuitionistic Linear Processes + + +
+ Curry-Howard correspondences between Linear Logic (LL) and session types
+provide a firm foundation for concurrent processes. As the correspondences
+hold for intuitionistic and classical versions of LL (ILL and CLL), we obtain
+two different families of type systems for concurrency. An open question
+remains: how exactly do these two families relate to each other? Based upon a
+translation from CLL to ILL due to Laurent (2018), we provide two
+complementary answers, in the form of full abstraction results based on a
+typed observational equivalence due to Atkey (2017). Our results elucidate
+hitherto missing formal links between seemingly related yet different type
+systems for concurrency.
+
+
+ comment: Full version, 19 pages + appendices +
+
+
+
+
+ + ♻ ☆ Coalgebraic Satisfiability Checking for Arithmetic $μ$-Calculi + + +
+ The coalgebraic $\mu$-calculus provides a generic semantic framework for +fixpoint logics over systems whose branching type goes beyond the standard +relational setup, e.g. probabilistic, weighted, or game-based. Previous work on +the coalgebraic $\mu$-calculus includes an exponential-time upper bound on +satisfiability checking, which however relies on the availability of tableau +rules for the next-step modalities that are sufficiently well-behaved in a +formally defined sense; in particular, rule matches need to be representable by +polynomial-sized codes, and the sequent duals of the rules need to absorb cut. +While such rule sets have been identified for some important cases, they are +not known to exist in all cases of interest, in particular ones involving +either integer weights as in the graded $\mu$-calculus, or real-valued weights +in combination with non-linear arithmetic. In the present work, we prove the +same upper complexity bound under more general assumptions, specifically +regarding the complexity of the (much simpler) satisfiability problem for the +underlying one-step logic, roughly described as the nesting-free next-step +fragment of the logic. The bound is realized by a generic global caching +algorithm that supports on-the-fly satisfiability checking. Notably, our +approach directly accommodates unguarded formulae, and thus avoids use of the +guardedness transformation. Example applications include new exponential-time +upper bounds for satisfiability checking in an extension of the graded +$\mu$-calculus with polynomial inequalities (including positive Presburger +arithmetic), as well as an extension of the (two-valued) probabilistic +$\mu$-calculus with polynomial inequalities. + +
+
+
+
+
+ + ♻ ☆ Game Comonads & Generalised Quantifiers + + +
+ Game comonads, introduced by Abramsky, Dawar and Wang and developed by +Abramsky and Shah, give an interesting categorical semantics to some +Spoiler-Duplicator games that are common in finite model theory. In particular +they expose connections between one-sided and two-sided games, and parameters +such as treewidth and treedepth and corresponding notions of decomposition. In +the present paper, we expand the realm of game comonads to logics with +generalised quantifiers. In particular, we introduce a comonad graded by two +parameters $n \leq k$ such that isomorphisms in the resulting Kleisli category +are exactly Duplicator winning strategies in Hella's $n$-bijection game with +$k$ pebbles. We define a one-sided version of this game which allows us to +provide a categorical semantics for a number of logics with generalised +quantifiers. We also give a novel notion of tree decomposition that emerges +from the construction. + +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 18 + +
+
+
+ + ☆ A simple and fast C++ thread pool implementation capable of running task + graphs + + +
+ In this paper, the author presents a simple and fast C++ thread pool +implementation capable of running task graphs. The implementation is publicly +available on GitHub, see https://github.com/dpuyda/scheduling. + +
+
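+ A language-neutral analogue of the idea (the repository above is C++; this
+Python sketch only shows the scheduling pattern): keep an indegree count per
+task and submit a task to the pool as soon as all of its dependencies have
+completed.
+
+  from collections import defaultdict
+  from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
+
+  def run_task_graph(tasks, deps, workers=4):
+      """tasks: {name: callable}; deps: {name: [prerequisites]}; acyclic."""
+      indegree = {t: len(deps.get(t, ())) for t in tasks}
+      children = defaultdict(list)
+      for t, prereqs in deps.items():
+          for p in prereqs:
+              children[p].append(t)
+      results = {}
+      with ThreadPoolExecutor(max_workers=workers) as pool:
+          running = {pool.submit(fn): t
+                     for t, fn in tasks.items() if indegree[t] == 0}
+          while running:
+              done, _ = wait(running, return_when=FIRST_COMPLETED)
+              for fut in done:
+                  name = running.pop(fut)
+                  results[name] = fut.result()
+                  for child in children[name]:
+                      indegree[child] -= 1
+                      if indegree[child] == 0:  # all dependencies finished
+                          running[pool.submit(tasks[child])] = child
+      return results
+
+ For example, run_task_graph({'a': f, 'b': g, 'c': h}, {'c': ['a', 'b']})
+runs f and g concurrently and starts h only after both finish.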
+
+
+
+ + ☆ Parallel Split Learning with Global Sampling + + +
+ The expansion of IoT devices and the demands of Deep Learning have +highlighted significant challenges in Distributed Deep Learning (DDL) systems. +Parallel Split Learning (PSL) has emerged as a promising derivative of Split +Learning that is well suited for distributed learning on resource-constrained +devices. However, PSL faces several obstacles, such as large effective batch +sizes, non-IID data distributions, and the straggler effect. We view these +issues as a sampling dilemma and propose to address them by orchestrating the +mini-batch sampling process on the server side. We introduce the Uniform Global +Sampling (UGS) method to decouple the effective batch size from the number of +clients and reduce mini-batch deviation in non-IID settings. To address the +straggler effect, we introduce the Latent Dirichlet Sampling (LDS) method, +which generalizes UGS to balance the trade-off between batch deviation and +training time. Our simulations reveal that our proposed methods enhance model +accuracy by up to 34.1% in non-IID settings and reduce the training time in the +presence of stragglers by up to 62%. In particular, LDS effectively mitigates +the straggler effect without compromising model accuracy or adding significant +computational overhead compared to UGS. Our results demonstrate the potential +of our methods as a promising solution for DDL in real applications. + +
+
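+ A sketch of what server-side orchestration of the mini-batch could look
+like under the abstract's description of UGS (assumed semantics, not the
+paper's code): the server samples a fixed-size batch uniformly over the union
+of all clients' data and tells each client which local indices to use, which
+decouples the effective batch size from the number of clients.
+
+  import random
+
+  def uniform_global_sample(client_sizes, batch_size, rng=random.Random(0)):
+      """client_sizes: {client_id: local dataset size}."""
+      pool = [(cid, i) for cid, n in client_sizes.items() for i in range(n)]
+      picked = rng.sample(pool, batch_size)  # uniform over the global data
+      per_client = {}
+      for cid, idx in picked:
+          per_client.setdefault(cid, []).append(idx)
+      return per_client    # sent to the clients; sizes sum to batch_size
+
+ Materializing the index pool is only for clarity; a real server would
+sample indices arithmetically.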
+
+
+
+ + ☆ CrashEventLLM: Predicting System Crashes with Large Language Models + + +
+ As the dependence on computer systems expands across various domains,
+ranging from personal and industrial to large-scale applications, there
+arises a compelling need to enhance their reliability to sustain business
+operations seamlessly and ensure optimal user satisfaction. System logs
+generated by these devices serve as valuable repositories of historical
+trends and past failures. The use of machine learning techniques for failure
+prediction has become commonplace, enabling the extraction of insights from
+past data to anticipate future behavior patterns. Recently, large language
+models have demonstrated remarkable capabilities in tasks including
+summarization, reasoning, and event prediction. Therefore, in this paper, we
+endeavor to investigate the potential of large language models in predicting
+system failures, leveraging insights learned from past failure behavior to
+inform reasoning and decision-making processes effectively. Our approach
+involves leveraging data from the Intel Computing Improvement Program (ICIP)
+system crash logs to identify significant events and develop CrashEventLLM.
+This model, built upon a large language model framework, serves as our
+foundation for crash event prediction. Specifically, our model utilizes
+historical data to forecast future crash events, informed by expert
+annotations. Additionally, it goes beyond mere prediction, offering insights
+into potential causes for each crash event. This work provides preliminary
+insights into prompt-based large language models for the log-based event
+prediction task.
+
+
+ comment: Accepted in ICITCOM'24. Copyrights will be with IEEE +
+
+
+
+
+ + ☆ vTensor: Flexible Virtual Tensor Management for Efficient LLM Serving + + +
+ Large Language Models (LLMs) are widely used across various domains, +processing millions of daily requests. This surge in demand poses significant +challenges in optimizing throughput and latency while keeping costs manageable. +The Key-Value (KV) cache, a standard method for retaining previous +computations, makes LLM inference highly bounded by memory. While batching +strategies can enhance performance, they frequently lead to significant memory +fragmentation. Even though cutting-edge systems like vLLM mitigate KV cache +fragmentation using paged Attention mechanisms, they still suffer from +inefficient memory and computational operations due to the tightly coupled page +management and computation kernels. + This study introduces the vTensor, an innovative tensor structure for LLM +inference based on GPU virtual memory management (VMM). vTensor addresses +existing limitations by decoupling computation from memory defragmentation and +offering dynamic extensibility. Our framework employs a CPU-GPU heterogeneous +approach, ensuring efficient, fragmentation-free memory management while +accommodating various computation kernels across different LLM architectures. +Experimental results indicate that vTensor achieves an average speedup of 1.86x +across different models, with up to 2.42x in multi-turn chat scenarios. +Additionally, vTensor provides average speedups of 2.12x and 3.15x in kernel +evaluation, reaching up to 3.92x and 3.27x compared to SGLang Triton +prefix-prefilling kernels and vLLM paged Attention kernel, respectively. +Furthermore, it frees approximately 71.25% (57GB) of memory on the NVIDIA A100 +GPU compared to vLLM, enabling more memory-intensive workloads. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ A New Theoretical Perspective on Data Heterogeneity in Federated + Optimization + + +
+ In federated learning (FL), data heterogeneity is the main reason that +existing theoretical analyses are pessimistic about the convergence rate. In +particular, for many FL algorithms, the convergence rate grows dramatically +when the number of local updates becomes large, especially when the product of +the gradient divergence and local Lipschitz constant is large. However, +empirical studies can show that more local updates can improve the convergence +rate even when these two parameters are large, which is inconsistent with the +theoretical findings. This paper aims to bridge this gap between theoretical +understanding and practical performance by providing a theoretical analysis +from a new perspective on data heterogeneity. In particular, we propose a new +and weaker assumption compared to the local Lipschitz gradient assumption, +named the heterogeneity-driven pseudo-Lipschitz assumption. We show that this +and the gradient divergence assumptions can jointly characterize the effect of +data heterogeneity. By deriving a convergence upper bound for FedAvg and its +extensions, we show that, compared to the existing works, local Lipschitz +constant is replaced by the much smaller heterogeneity-driven pseudo-Lipschitz +constant and the corresponding convergence upper bound can be significantly +reduced for the same number of local updates, although its order stays the +same. In addition, when the local objective function is quadratic, more +insights on the impact of data heterogeneity can be obtained using the +heterogeneity-driven pseudo-Lipschitz constant. For example, we can identify a +region where FedAvg can outperform mini-batch SGD even when the gradient +divergence can be arbitrarily large. Our findings are validated using +experiments. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ☆ The Diversity Bonus: Learning from Dissimilar Distributed Clients in + Personalized Federated Learning + + +
+ Personalized Federated Learning (PFL) is a commonly used framework that +allows clients to collaboratively train their personalized models. PFL is +particularly useful for handling situations where data from different clients +are not independent and identically distributed (non-IID). Previous research in +PFL implicitly assumes that clients can gain more benefits from those with +similar data distributions. Correspondingly, methods such as personalized +weight aggregation are developed to assign higher weights to similar clients +during training. We pose a question: can a client benefit from other clients +with dissimilar data distributions and if so, how? This question is +particularly relevant in scenarios with a high degree of non-IID, where clients +have widely different data distributions, and learning from only similar +clients will lose knowledge from many other clients. We note that when dealing +with clients with similar data distributions, methods such as personalized +weight aggregation tend to enforce their models to be close in the parameter +space. It is reasonable to conjecture that a client can benefit from dissimilar +clients if we allow their models to depart from each other. Based on this idea, +we propose DiversiFed which allows each client to learn from clients with +diversified data distribution in personalized federated learning. DiversiFed +pushes personalized models of clients with dissimilar data distributions apart +in the parameter space while pulling together those with similar distributions. +In addition, to achieve the above effect without using prior knowledge of data +distribution, we design a loss function that leverages the model similarity to +determine the degree of attraction and repulsion between any two models. +Experiments on several datasets show that DiversiFed can benefit from +dissimilar clients and thus outperform the state-of-the-art methods. + +
+
+ comment: 14 pages, 9 figures +
+
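+ The attraction/repulsion idea can be sketched as a regularizer added to the
+local loss (a hedged reading of the abstract; the paper's exact loss may
+differ): the similarity decides the sign, so similar peers pull the model
+closer while dissimilar peers push it away.
+
+  import torch
+  import torch.nn.functional as F
+
+  def diversity_regularizer(my_params, peer_params):
+      """my_params: flat 1-D tensor; peer_params: list of such tensors."""
+      reg = my_params.new_zeros(())
+      for peer in peer_params:
+          peer = peer.detach()
+          sim = F.cosine_similarity(my_params, peer, dim=0)  # in [-1, 1]
+          dist = torch.norm(my_params - peer)
+          reg = reg + sim * dist     # sim > 0: attract; sim < 0: repel
+      return reg / max(len(peer_params), 1)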
+
+
+
+ + ☆ GraphScale: A Framework to Enable Machine Learning over Billion-node + Graphs + + +
+ Graph Neural Networks (GNNs) have emerged as powerful tools for supervised
+ machine learning over graph-structured data, while sampling-based node
+ representation learning is widely utilized in unsupervised learning. However,
+ scalability remains a major challenge in both supervised and unsupervised
+ learning for large graphs (e.g., those with over 1 billion nodes). The
+ scalability bottleneck largely stems from the mini-batch sampling phase in
+ GNNs and the random walk sampling phase in unsupervised methods. These
+ processes often require storing features or embeddings in memory. In the
+ context of distributed training, they require frequent, inefficient random
+ access to data stored across different workers. Such repeated inter-worker
+ communication for each mini-batch leads to high communication overhead and
+ computational inefficiency.
+ We propose GraphScale, a unified framework for both supervised and
+ unsupervised learning that stores and processes large graph data in a
+ distributed manner. The key insight in our design is the separation of
+ workers that store data from those that perform the training. This separation
+ allows us to decouple computing and storage in graph training, effectively
+ building a pipeline in which data fetching and data computation overlap
+ asynchronously. Our experiments show that GraphScale outperforms
+ state-of-the-art methods for distributed training of both GNNs and node
+ embeddings. We evaluate GraphScale on both public and proprietary graph
+ datasets and observe a reduction of at least 40% in end-to-end training time
+ compared to popular distributed frameworks, without any loss in performance.
+ While most existing methods do not support training node embeddings on
+ billion-node graphs, GraphScale is currently deployed in production at
+ TikTok, enabling efficient learning over such large graphs.
+
+
+
+ comment: Published in the Proceedings of the 33rd ACM International Conference + on Information and Knowledge Management (CIKM 2024), 8 Pages, 12 Figures +
+
+
+
+
+ + ☆ Automated Road Safety: Enhancing Sign and Surface Damage Detection with + AI + + +
+ Public transportation plays a crucial role in our lives, and the road network
+ is a vital component in the implementation of smart cities. Recent
+ advancements in AI have enabled the development of advanced monitoring
+ systems capable of detecting anomalies in road surfaces and road signs,
+ which, if unaddressed, can lead to serious road accidents. This paper
+ presents an innovative approach to enhance road safety through the detection
+ and classification of traffic signs and road surface damage using advanced
+ deep learning techniques. This integrated approach supports proactive
+ maintenance strategies, improving road safety and resource allocation for the
+ Molise region and the city of Campobasso. The resulting system, developed as
+ part of the Casa delle Tecnologie Emergenti (House of Emergent Technologies)
+ Molise (Molise CTE) research project funded by the Italian Ministry of
+ Economic Growth (MIMIT), leverages cutting-edge technologies such as Cloud
+ Computing and High Performance Computing with GPU utilization. It serves as a
+ valuable tool for municipalities, enabling quick detection of anomalies and
+ the prompt organization of maintenance operations.
+
+
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ Tackling Selfish Clients in Federated Learning + + +
+ Federated Learning (FL) is a distributed machine learning paradigm that
+ enables participants to collaboratively train a model without revealing their
+ local data. However, when FL is deployed in the wild, some intelligent
+ clients can deliberately deviate from the standard training process to make
+ the global model inclined toward their local model, thereby prioritizing
+ their local data distribution. We refer to this novel category of misbehaving
+ clients as selfish. In this paper, we propose a Robust aggregation strategy
+ for the FL server to mitigate the effect of Selfishness (RFL-Self for short).
+ RFL-Self incorporates an innovative method to recover (or estimate) the true
+ updates of selfish clients from the received ones, leveraging robust
+ statistics (the median of norms) of the updates at every round. By including
+ the recovered updates in aggregation, our strategy offers strong robustness
+ against selfishness. Our experimental results, obtained on the MNIST and
+ CIFAR-10 datasets, demonstrate that just 2% of clients behaving selfishly can
+ decrease the accuracy by up to 36%, and that RFL-Self can mitigate this
+ effect without degrading the global model performance.
+
+
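+ As a rough picture of the median-of-norms idea, consider rescaling received
+ updates whose norm exceeds the per-round median before averaging (a generic
+ robust-statistics baseline, not the paper's exact recovery rule):
+
+import torch
+
+def robust_aggregate(updates):
+    norms = torch.stack([u.norm() for u in updates])
+    med = norms.median()
+    rescaled = [u * min(1.0, (med / (n + 1e-12)).item())  # shrink outliers
+                for u, n in zip(updates, norms)]
+    return torch.stack(rescaled).mean(dim=0)
+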
+
+ comment: 10 pages, 16 figures. European Conference on Artificial Intelligence + (ECAI) 2024 +
+
+
+
+
+ + ☆ Poisoning with A Pill: Circumventing Detection in Federated Learning + + +
+ Without direct access to clients' data, federated learning (FL) is well known
+ among existing distributed machine learning techniques for its unique
+ strength in data privacy protection. However, its distributed and iterative
+ nature makes FL inherently vulnerable to various poisoning attacks. To
+ counteract these threats, extensive defenses have been proposed to filter out
+ malicious clients using various detection metrics. Based on our analysis of
+ existing attacks and defenses, we find that model redundancy has received
+ little attention. In neural networks, different model parameters contribute
+ differently to the model's performance. However, existing attacks in FL
+ manipulate all the model update parameters with the same strategy, making
+ them easily detectable by common defenses. Meanwhile, the defenses tend to
+ analyze the overall statistical features of the entire model update, leaving
+ room for sophisticated attacks. Based on these observations, this paper
+ proposes a generic and attack-agnostic augmentation approach designed to
+ enhance the effectiveness and stealthiness of existing FL poisoning attacks
+ against detection, pointing out inherent flaws of existing defenses and
+ exposing the necessity of fine-grained FL security. Specifically, we employ a
+ three-stage methodology that strategically constructs, generates, and injects
+ poison (generated by existing attacks) into a pill (a tiny subnet with a
+ novel structure) during FL training; we name these stages pill construction,
+ pill poisoning, and pill injection, respectively. Extensive experimental
+ results show that FL poisoning attacks enhanced by our method can bypass all
+ the popular defenses and achieve up to a 7x increase in error rate, as well
+ as, on average, more than a 2x increase in error rate, on both IID and
+ non-IID data, in both cross-silo and cross-device FL systems.
+
+
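+ A toy rendering of the pill intuition (illustrative only; the actual pill is
+ a structured subnet, and the selection below, by smallest weight magnitude,
+ is our own stand-in):
+
+import torch
+
+def inject_pill(benign_update, poison_update, pill_frac=0.01):
+    flat = benign_update.flatten()
+    k = max(1, int(pill_frac * flat.numel()))
+    idx = torch.topk(flat.abs(), k, largest=False).indices  # low-salience slots
+    out = flat.clone()
+    out[idx] = poison_update.flatten()[idx]  # concentrate poison in a tiny subnet
+    return out.view_as(benign_update)
+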
+
+
+
+
+ + ☆ Versioned Analysis of Software Quality Indicators and Self-admitted + Technical Debt in Ethereum Smart Contracts with Ethstractor + + +
+ The rise of decentralized applications (dApps) has made smart contracts +imperative components of blockchain technology. As many smart contracts process +financial transactions, their security is paramount. Moreover, the immutability +of blockchains makes vulnerabilities in smart contracts particularly +challenging because it requires deploying a new version of the contract at a +different address, incurring substantial fees paid in Ether. This paper +proposes Ethstractor, the first smart contract collection tool for gathering a +dataset of versioned smart contracts. The collected dataset is then used to +evaluate the reliability of code metrics as indicators of vulnerabilities in +smart contracts. Our findings indicate that code metrics are ineffective in +signalling the presence of vulnerabilities. Furthermore, we investigate whether +vulnerabilities in newer versions of smart contracts are mitigated and identify +that the number of vulnerabilities remains consistent over time. Finally, we +examine the removal of self-admitted technical debt in contracts and uncover +that most of the introduced debt has never been subsequently removed. + +
+
+ comment: Copyright 2024 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Mélange: Cost Efficient Large Language Model Serving by Exploiting GPU + Heterogeneity + + +
+ Large language models (LLMs) are increasingly integrated into many online
+ services, yet they remain cost-prohibitive to deploy due to the requirement
+ of expensive GPU instances. Prior work has addressed the high cost of LLM
+ serving by improving the inference engine, but less attention has been given
+ to selecting the most cost-efficient GPU type(s) for a specific LLM service.
+ There is a large and growing landscape of GPU types and, within these
+ options, higher cost does not always lead to increased performance. Instead,
+ through a comprehensive investigation, we find that three key LLM service
+ characteristics (request size, request rate, SLO) strongly influence GPU cost
+ efficiency, and differing GPU types are most cost efficient for differing LLM
+ service settings. As a result, the most cost-efficient allocation for a given
+ service is typically a mix of heterogeneous GPU types. Based on this
+ analysis, we introduce Mélange, a GPU allocation framework that navigates
+ these diverse LLM service characteristics and heterogeneous GPU option space
+ to automatically and efficiently derive the minimal-cost GPU allocation for a
+ given LLM service. We formulate the GPU allocation task as a cost-aware bin
+ packing problem where GPUs are bins and items are slices of the service
+ workload. Our formulation's constraints account for a service's unique
+ characteristics, allowing Mélange to be flexible to support diverse service
+ settings and heterogeneity-aware to adapt the GPU allocation to a specific
+ service. Compared to using only a single GPU type, Mélange reduces deployment
+ costs by up to 77% in conversational settings, 33% in document-based
+ settings, and 51% in a mixed setting.
+
+
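+ The bin-packing view can be made concrete with a tiny brute-force version
+ (Mélange itself solves an ILP-style formulation; the GPU capacities and
+ prices below are invented for illustration):
+
+from itertools import product
+
+def min_cost_allocation(load_rps, gpu_types):
+    # gpu_types: (name, capacity in req/s at the SLO, dollars per hour)
+    best = None
+    ranges = [range(int(load_rps // cap) + 2) for _, cap, _ in gpu_types]
+    for counts in product(*ranges):
+        cap = sum(n * c for n, (_, c, _) in zip(counts, gpu_types))
+        cost = sum(n * p for n, (_, _, p) in zip(counts, gpu_types))
+        if cap >= load_rps and (best is None or cost < best[0]):
+            best = (cost, counts)
+    return best
+
+# A heterogeneous mix wins here: 1x A100 + 1x A10G covers 90 req/s for $4.70/h,
+# cheaper than 2x A100 ($7.40/h) or 5x A10G ($5.00/h).
+print(min_cost_allocation(90, [("A10G", 20, 1.0), ("A100", 75, 3.7)]))
+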
+
+
+
+
+ + ♻ ☆ DP-DyLoRA: Fine-Tuning Transformer-Based Models On-Device under + Differentially Private Federated Learning using Dynamic Low-Rank Adaptation + + +
+ Federated learning (FL) allows clients to collaboratively train a global +model without sharing their local data with a server. However, clients' +contributions to the server can still leak sensitive information. Differential +privacy (DP) addresses such leakage by providing formal privacy guarantees, +with mechanisms that add randomness to the clients' contributions. The +randomness makes it infeasible to train large transformer-based models, common +in modern federated learning systems. In this work, we empirically evaluate the +practicality of fine-tuning large scale on-device transformer-based models with +differential privacy in a federated learning system. We conduct comprehensive +experiments on various system properties for tasks spanning a multitude of +domains: speech recognition, computer vision (CV) and natural language +understanding (NLU). Our results show that full fine-tuning under +differentially private federated learning (DP-FL) generally leads to huge +performance degradation which can be alleviated by reducing the dimensionality +of contributions through parameter-efficient fine-tuning (PEFT). Our benchmarks +of existing DP-PEFT methods show that DP-Low-Rank Adaptation (DP-LoRA) +consistently outperforms other methods. An even more promising approach, +DyLoRA, which makes the low rank variable, when naively combined with FL would +straightforwardly break differential privacy. We therefore propose an +adaptation method that can be combined with differential privacy and call it +DP-DyLoRA. Finally, we are able to reduce the accuracy degradation and word +error rate (WER) increase due to DP to less than 2% and 7% respectively with 1 +million clients and a stringent privacy budget of $\epsilon=2$. + +
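+ A simple central-DP rendering of the aggregation step shows why PEFT helps:
+ the noise scales with the clipped contribution, and with LoRA `deltas`
+ contains only low-rank adapter weights (sketch with placeholder clip and
+ noise parameters; real DP-FL systems differ in where clipping and noise are
+ applied):
+
+import torch
+
+def dp_aggregate(deltas, clip=1.0, noise_mult=1.0):
+    clipped = [d * min(1.0, clip / (d.norm().item() + 1e-12)) for d in deltas]
+    avg = torch.stack(clipped).mean(dim=0)
+    sigma = noise_mult * clip / len(deltas)  # std of noise after averaging
+    return avg + sigma * torch.randn_like(avg)
+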
+
+ comment: 16 pages, 10 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ SemiSFL: Split Federated Learning on Unlabeled and Non-IID Data + + +
+ Federated Learning (FL) has emerged to allow multiple clients to
+ collaboratively train machine learning models on their private data at the
+ network edge. However, training and deploying large-scale models on
+ resource-constrained devices is challenging. Fortunately, Split Federated
+ Learning (SFL) offers a feasible solution by alleviating the computation
+ and/or communication burden on clients. However, existing SFL works often
+ assume sufficient labeled data on clients, which is usually impractical.
+ Besides, data non-IIDness poses another challenge to efficient model
+ training. To the best of our knowledge, these two issues have not been
+ simultaneously addressed in SFL. Herein, we propose a novel semi-supervised
+ SFL system, termed SemiSFL, which incorporates clustering regularization to
+ perform SFL with unlabeled and non-IID client data. Moreover, our theoretical
+ and experimental investigations into model convergence reveal that the
+ inconsistent training processes on labeled and unlabeled data influence the
+ effectiveness of clustering regularization. To mitigate the training
+ inconsistency, we develop an algorithm for dynamically adjusting the global
+ updating frequency, so as to improve training performance. Extensive
+ experiments on benchmark models and datasets show that our system provides a
+ 3.8x speedup in training time, reduces the communication cost by about 70.3%
+ while reaching the target accuracy, and achieves up to a 5.8% improvement in
+ accuracy under non-IID scenarios compared to state-of-the-art baselines.
+
+
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Supercharging Federated Learning with Flower and NVIDIA FLARE + + +
+ Several open-source systems, such as Flower and NVIDIA FLARE, have been +developed in recent years while focusing on different aspects of federated +learning (FL). Flower is dedicated to implementing a cohesive approach to FL, +analytics, and evaluation. Over time, Flower has cultivated extensive +strategies and algorithms tailored for FL application development, fostering a +vibrant FL community in research and industry. Conversely, FLARE has +prioritized the creation of an enterprise-ready, resilient runtime environment +explicitly designed for FL applications in production environments. In this +paper, we describe our initial integration of both frameworks and show how they +can work together to supercharge the FL ecosystem as a whole. Through the +seamless integration of Flower and FLARE, applications crafted within the +Flower framework can effortlessly operate within the FLARE runtime environment +without necessitating any modifications. This initial integration streamlines +the process, eliminating complexities and ensuring smooth interoperability +between the two platforms, thus enhancing the overall efficiency and +accessibility of FL applications. + +
+
+ comment: Added a figure comparing running a Flower application natively or + within FLARE +
+
+
+
+
+ + ♻ ☆ A Survey on Federated Analytics: Taxonomy, Enabling Techniques, + Applications and Open Issues + + +
+ The escalating influx of data generated by networked edge devices, coupled
+ with the growing awareness of data privacy, has restricted the traditional
+ data analytics workflow, where edge data are gathered by a centralized server
+ for further use by data analysts. To continue leveraging vast edge data to
+ support various data-intensive applications, a transformative shift is being
+ promoted in computing paradigms, from centralized data processing to
+ privacy-preserving distributed data processing. The need to perform data
+ analytics on private edge data motivates federated analytics (FA), an
+ emerging technique to support collaborative data analytics among diverse data
+ owners without centralizing the raw data. Despite the wide applications of FA
+ in industry and academia, a comprehensive examination of existing research
+ efforts in FA has been notably absent. This survey aims to bridge this gap by
+ first providing an overview of FA, elucidating key concepts, and discussing
+ its relationship with similar concepts. We then conduct a thorough
+ examination of FA, including its key challenges, taxonomy, and enabling
+ techniques. Diverse FA applications, including statistical metrics,
+ frequency-related applications, database query operations, FL-assisting FA
+ tasks, and other wireless network applications, are then carefully reviewed.
+ We complete the survey with several open research issues, future directions,
+ and a comprehensive discussion of lessons learned. This survey intends to
+ provide a holistic understanding of emerging FA techniques and foster the
+ continued evolution of privacy-preserving distributed data processing in the
+ emerging networked society.
+
+
+
+ comment: This survey has been submitted to IEEE Communications Surveys & + Tutorials +
+
+
+
+
+ + ♻ ☆ MergeSFL: Split Federated Learning with Feature Merging and Batch Size + Regulation + + +
+ Recently, federated learning (FL) has emerged as a popular technique for edge
+ AI to mine valuable knowledge in edge computing (EC) systems. To mitigate the
+ computing/communication burden on resource-constrained workers and protect
+ model privacy, split federated learning (SFL) has been introduced,
+ integrating both data and model parallelism. Beyond resource limitations, SFL
+ faces two other critical challenges in EC, namely statistical heterogeneity
+ and system heterogeneity. To address these challenges, we propose a novel SFL
+ framework, termed MergeSFL, that incorporates feature merging and batch size
+ regulation into SFL. Concretely, feature merging merges the features from
+ workers into a mixed feature sequence that is approximately equivalent to the
+ features derived from IID data, and is employed to promote model accuracy,
+ while batch size regulation assigns diverse and suitable batch sizes to
+ heterogeneous workers to improve training efficiency. Moreover, MergeSFL
+ jointly optimizes these two strategies, exploiting their coupled relationship
+ to further enhance the performance of SFL. Extensive experiments were
+ conducted on a physical platform with 80 NVIDIA Jetson edge devices, and the
+ results show that MergeSFL can improve final model accuracy by 5.82% to
+ 26.22%, with a speedup of about 1.74x to 4.14x, compared to the baselines.
+
+
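+ Feature merging can be pictured as shuffling the smashed features from all
+ workers into one mixed batch before the server-side forward pass (sketch
+ only; batch size regulation and the joint optimization are not captured
+ here):
+
+import torch
+
+def merge_features(feature_batches, label_batches):
+    feats = torch.cat(feature_batches, dim=0)  # features sent by the workers
+    labels = torch.cat(label_batches, dim=0)
+    perm = torch.randperm(feats.shape[0])      # mix into an approximately IID batch
+    return feats[perm], labels[perm]
+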
+
+
+
+
+ + ♻ ☆ Honeybee: Decentralized Peer Sampling with Verifiable Random Walks for + Blockchain Data Sharding + + +
+ Data sharding (in which block data is sharded without sharding compute) is at
+ present the favored approach for scaling Ethereum and other popular
+ blockchains. A key challenge in implementing data sharding is verifying
+ whether the entirety of a block's data is available in the network (across
+ its shards). A central technique proposed for this verification uses
+ erasure-coded blocks and is called data availability sampling (DAS). While
+ the high-level protocol details of DAS have been well discussed in the
+ community, discussions of how such a protocol will be implemented at the
+ peer-to-peer layer are lacking. We identify random sampling of nodes as a
+ fundamental primitive necessary to carry out DAS and present Honeybee, a
+ decentralized algorithm for sampling nodes that uses verifiable random walks.
+ Honeybee is secure against attacks even in the presence of a large number of
+ Byzantine nodes (e.g., 50% of the network). We evaluate Honeybee through
+ experiments and show that the quality of sampling achieved by Honeybee is
+ significantly better than the state-of-the-art. Our proposed algorithm has
+ implications for DAS functions in both full nodes and light nodes.
+
+
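+ For intuition, plain (non-verifiable) random-walk peer sampling looks as
+ follows; Honeybee's contribution is making such walks verifiable against
+ Byzantine peers, which this toy deliberately omits:
+
+import random
+
+def random_walk_sample(adjacency, start, steps):
+    node = start
+    for _ in range(steps):
+        node = random.choice(adjacency[node])  # adjacency: node -> neighbor list
+    return node  # long walks approach the walk's stationary distribution
+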
+
+ comment: 26 pages +
+
+
+
+
+
+
+
+ + Programming and Languages 3 + +
+
+
+ + ☆ Preventing Out-of-Gas Exceptions by Typing + + +
+ We continue the development of TinySol, a minimal object-oriented language +based on Solidity, the standard smart-contract language used for the Ethereum +platform. We first extend TinySol with exceptions and a gas mechanism, and +equip it with a small-step operational semantics. Introducing the gas mechanism +is fundamental for modelling real-life smart contracts in TinySol, since this +is the way in which termination of Ethereum smart contracts is usually ensured. +We then devise a type system for smart contracts guaranteeing that such +programs never run out of gas at runtime. This is a desirable property for +smart contracts, since a transaction that runs out of gas is aborted, but the +price paid to run the code is not returned to the invoker. + +
+
+
+
+
+ + ☆ MoXIchecker: An Extensible Model Checker for MoXI + + +
+ MoXI is a new intermediate verification language introduced in 2024 to +promote the standardization and open-source implementations for symbolic model +checking by extending the SMT-LIB 2 language with constructs to define +state-transition systems. The tool suite of MoXI provides a translator from +MoXI to Btor2, which is a lower-level intermediate language for hardware +verification, and a translation-based model checker, which invokes mature +hardware model checkers for Btor2 to analyze the translated verification tasks. +The extensibility of such a translation-based model checker is restricted +because more complex theories, such as integer or real arithmetics, cannot be +precisely expressed with bit-vectors of fixed lengths in Btor2. We present +MoXIchecker, the first model checker that solves MoXI verification tasks +directly. Instead of translating MoXI to lower-level languages, MoXIchecker +uses the solver-agnostic library PySMT for SMT solvers as backend for its +verification algorithms. MoXIchecker is extensible because it accommodates +verification tasks involving more complex theories, not limited by lower-level +languages, facilitates the implementation of new algorithms, and is +solver-agnostic by using the API of PySMT. In our evaluation, MoXIchecker +uniquely solved tasks that use integer or real arithmetics, and achieved a +comparable performance against the translation-based model checker from the +MoXI tool suite. + +
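+ A minimal flavor of solver-agnostic bounded model checking on top of PySMT
+ (our own toy transition system with integer arithmetic, not MoXIchecker
+ code; running it requires an SMT solver installed for PySMT):
+
+from pysmt.shortcuts import Symbol, Int, Equals, Plus, And, Or, is_sat
+from pysmt.typing import INT
+
+def bmc(bound):
+    xs = [Symbol(f"x{i}", INT) for i in range(bound + 1)]  # one copy per step
+    init = Equals(xs[0], Int(0))                           # init: x = 0
+    trans = And(*[Equals(xs[i + 1], Plus(xs[i], Int(2)))   # step: x := x + 2
+                  for i in range(bound)])
+    bad = Or(*[Equals(x, Int(7)) for x in xs])             # can x ever be 7?
+    return is_sat(And(init, trans, bad))  # SAT would be a counterexample
+
+print(bmc(10))  # False: the even counter never reaches 7
+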
+
+ comment: 13 pages, 6 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Story of Your Lazy Function's Life: A Bidirectional Demand Semantics for + Mechanized Cost Analysis of Lazy Programs + + +
+ Lazy evaluation is a powerful tool that enables better compositionality and +potentially better performance in functional programming, but it is challenging +to analyze its computation cost. Existing works either require manually +annotating sharing, or rely on separation logic to reason about heaps of +mutable cells. In this paper, we propose a bidirectional demand semantics that +allows for extrinsic reasoning about the computation cost of lazy programs +without relying on special program logics. To show the effectiveness of our +approach, we apply the demand semantics to a variety of case studies including +insertion sort, selection sort, Okasaki's banker's queue, and the implicit +queue. We formally prove that the banker's queue and the implicit queue are +both amortized and persistent using the Rocq Prover (formerly known as Coq). We +also propose the reverse physicist's method, a novel variant of the classical +physicist's method, which enables mechanized, modular and compositional +reasoning about amortization and persistence with the demand semantics. + +
+
+ comment: Accepted by ICFP 2024 +
+
+
+
+
+
+
+
+ + Performance Profiling 2 + +
+
+
+ + ☆ The Bicameral Cache: a split cache for vector architectures + + +
+ The Bicameral Cache is a cache organization proposal for a vector
+ architecture that segregates data according to their access type,
+ distinguishing scalar from vector references. Its aim is to prevent the two
+ types of references from interfering with each other's data locality, with a
+ special focus on prioritizing the performance of vector references. The
+ proposed system incorporates an additional, non-polluting prefetching
+ mechanism that helps populate the long vector cache lines in advance,
+ increasing the hit rate by further exploiting the spatial locality of vector
+ data. Its evaluation was conducted on the Cavatools simulator, comparing its
+ performance to a standard conventional cache over typical vector benchmarks
+ for several vector lengths. The results showed that the proposed cache speeds
+ up stride-1 vector benchmarks while hardly impacting non-stride-1 ones. In
+ addition, the prefetching feature consistently provided additional value.
+
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ KWT-Tiny: RISC-V Accelerated, Embedded Keyword Spotting Transformer + + +
+ This paper explores the adaptation of Transformer-based models for edge
+ devices through the quantisation and hardware acceleration of the ARM Keyword
+ Transformer (KWT) model on a RISC-V platform. The model was targeted to run
+ in 64 kB of RAM in bare-metal C using a custom-developed edge AI library.
+ KWT-1 was retrained to be 369 times smaller, with only a 10% loss in
+ accuracy, by reducing the output classes from 35 to 2. The retraining and
+ quantisation reduced the model size from 2.42 MB to 1.65 kB. The integration
+ of custom RISC-V instructions that accelerate the GELU and SoftMax operations
+ enabled a 5x speedup, and thus roughly a 5x power reduction, in inference,
+ with inference clock cycle counts decreasing from 26 million to 5.5 million
+ while incurring a small area overhead of approximately 29%. The results
+ demonstrate a viable method for porting and accelerating Transformer-based
+ models in low-power IoT devices.
+
+
+
+ comment: 6 pages, 7 figures, accepted to be published in the IEEE SOCC 2024 + conference +
+
+
+
+
+
+
+
+ + Computational Complexity 2 + +
+
+
+ + ☆ Explaining Decisions in ML Models: a Parameterized Complexity Analysis + + +
+ This paper presents a comprehensive theoretical investigation into the +parameterized complexity of explanation problems in various machine learning +(ML) models. Contrary to the prevalent black-box perception, our study focuses +on models with transparent internal mechanisms. We address two principal types +of explanation problems: abductive and contrastive, both in their local and +global variants. Our analysis encompasses diverse ML models, including Decision +Trees, Decision Sets, Decision Lists, Ordered Binary Decision Diagrams, Random +Forests, and Boolean Circuits, and ensembles thereof, each offering unique +explanatory challenges. This research fills a significant gap in explainable AI +(XAI) by providing a foundational understanding of the complexities of +generating explanations for these models. This work provides insights vital for +further research in the domain of XAI, contributing to the broader discourse on +the necessity of transparency and accountability in AI systems. + +
+
+ comment: A short version of the paper has been accepted at the 21st + International Conference on Principles of Knowledge Representation and + Reasoning (KR 2024) +
+
+
+
+
+ + ♻ ☆ Separations in Proof Complexity and TFNP + + +
+ It is well-known that Resolution proofs can be efficiently simulated by +Sherali-Adams (SA) proofs. We show, however, that any such simulation needs to +exploit huge coefficients: Resolution cannot be efficiently simulated by SA +when the coefficients are written in unary. We also show that Reversible +Resolution (a variant of MaxSAT Resolution) cannot be efficiently simulated by +Nullstellensatz (NS). + These results have consequences for total NP search problems. First, we +characterise the classes PPADS, PPAD, SOPL by unary-SA, unary-NS, and +Reversible Resolution, respectively. Second, we show that, relative to an +oracle, PLS $\not\subseteq$ PPP, SOPL $\not\subseteq$ PPA, and EOPL +$\not\subseteq$ UEOPL. In particular, together with prior work, this gives a +complete picture of the black-box relationships between all classical TFNP +classes introduced in the 1990s. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ♻ ☆ MLRegTest: A Benchmark for the Machine Learning of Regular Languages + + +
+ Synthetic datasets constructed from formal languages allow fine-grained +examination of the learning and generalization capabilities of machine learning +systems for sequence classification. This article presents a new benchmark for +machine learning systems on sequence classification called MLRegTest, which +contains training, development, and test sets from 1,800 regular languages. +Different kinds of formal languages represent different kinds of long-distance +dependencies, and correctly identifying long-distance dependencies in sequences +is a known challenge for ML systems to generalize successfully. MLRegTest +organizes its languages according to their logical complexity (monadic second +order, first order, propositional, or monomial expressions) and the kind of +logical literals (string, tier-string, subsequence, or combinations thereof). +The logical complexity and choice of literal provides a systematic way to +understand different kinds of long-distance dependencies in regular languages, +and therefore to understand the capacities of different ML systems to learn +such long-distance dependencies. Finally, the performance of different neural +networks (simple RNN, LSTM, GRU, transformer) on MLRegTest is examined. The +main conclusion is that performance depends significantly on the kind of test +set, the class of language, and the neural network architecture. + +
+
+ comment: 43 pages, MLRegTest benchmark available at + https://doi.org/10.5061/dryad.dncjsxm4h , associated code at + https://github.com/heinz-jeffrey/subregular-learning +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 6 + +
+
+
+ + ☆ LSM-GNN: Large-scale Storage-based Multi-GPU GNN Training by Optimizing + Data Transfer Scheme + + +
+ Graph Neural Networks (GNNs) are widely used today in recommendation systems,
+ fraud detection, and node/link classification tasks. Real-world GNNs continue
+ to scale in size and require a large memory footprint for storing graphs and
+ embeddings that often exceeds the memory capacity of the target GPUs used for
+ training. To address limited memory capacities, traditional GNN training
+ approaches use graph partitioning and sharding techniques to scale up across
+ multiple GPUs within a node and/or scale out across multiple nodes. However,
+ this approach suffers from the high computational cost of graph partitioning
+ algorithms and inefficient communication across GPUs.
+ To address these overheads, we propose the Large-scale Storage-based
+ Multi-GPU GNN framework (LSM-GNN), a storage-based approach to training GNN
+ models that utilizes a novel communication layer enabling GPU software caches
+ to function as a system-wide shared cache with low overheads. LSM-GNN
+ incorporates a hybrid eviction policy that intelligently manages cache space
+ by using both static and dynamic node information to significantly enhance
+ cache performance. Furthermore, we introduce the Preemptive Victim-buffer
+ Prefetcher (PVP), a mechanism for prefetching node feature data from a Victim
+ Buffer located in CPU pinned memory to further reduce the pressure on the
+ storage devices. Experimental results show that despite the lower compute
+ capabilities and memory capacities, LSM-GNN on a single node with two GPUs
+ offers superior performance over a two-node, four-GPU Dist-DGL baseline and
+ provides up to a 3.75x speedup in end-to-end epoch time when running
+ large-scale GNN training.
+
+
+
+
+
+
+ + ☆ Secure Web Objects: Building Blocks for Metaverse Interoperability and + Decentralization + + +
+ This position paper explores how to support the Web's evolution through an
+ underlying data-centric approach that better matches the data-orientedness of
+ modern and emerging applications. We revisit the original vision of the Web
+ as a hypermedia system that supports document composability and application
+ interoperability via name-based data access. We propose the use of secure web
+ objects (SWO), a data-oriented communication approach that can reduce
+ complexity, centrality, and inefficiency, particularly for collaborative and
+ local-first applications such as the Metaverse. SWO are named, signed,
+ application-defined objects that are secured independently of their
+ containers or communication channels, an approach that leverages results from
+ over a decade of data-centric networking research. This approach does not
+ require intermediation by aggregators of identity, storage, and other
+ services that are common today. We present a brief design overview,
+ illustrated through prototypes for two editors of shared hypermedia
+ documents: one for 3D and one for LaTeX. We also discuss our findings and
+ suggest a roadmap for future research.
+
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Lessons Learned on the Path to Guaranteeing the Error Bound in Lossy + Quantizers + + +
+ Rapidly increasing data sizes in scientific computing are the driving force +behind the need for lossy compression. The main drawback of lossy data +compression is the introduction of error. This paper explains why many +error-bounded compressors occasionally violate the error bound and presents the +solutions we use in LC, a CPU/GPU compatible lossy compression framework, to +guarantee the error bound for all supported types of quantizers. We show that +our solutions maintain high compression ratios and cause no appreciable change +in throughput. + +
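+ One standard way to make the bound unconditional, in the spirit of (though
+ not necessarily identical to) LC's fixes, is to verify each reconstruction
+ and fall back to lossless storage when floating-point rounding breaks the
+ bound:
+
+import numpy as np
+
+def quantize_guarded(data, bound):
+    codes = np.rint(data / (2 * bound))   # linear-scaling quantizer
+    recon = codes * (2 * bound)
+    bad = np.abs(recon - data) > bound    # bound violated by FP rounding
+    return codes, bad, data[bad]          # violators are stored verbatim
+
+d = np.random.rand(1_000_000).astype(np.float32)
+codes, bad, raw = quantize_guarded(d, np.float32(1e-7))
+print(int(bad.sum()), "values needed the lossless fallback")
+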
+
+ comment: 12 pages, 4 figures, 9 tables, presented at the CAV 2024 Workshop on + Correct Data Compression +
+
+
+
+
+ + ♻ ☆ DPDPU: Data Processing with DPUs + + +
+ Improving the performance and reducing the cost of cloud data systems is +increasingly challenging. Data processing units (DPUs) are a promising +solution, but utilizing them for data processing needs characterizing the new +hardware and recognizing their capabilities and constraints. We hence propose +DPDPU, a platform for holistically exploiting DPUs to optimize data processing +tasks that are critical to performance and cost. It seeks to fill the semantic +gap between DPUs and data processing systems and handle DPU heterogeneity with +three engines dedicated to compute, networking, and storage. This paper +describes our vision, DPDPU's key components, their associated utilization +challenges, as well as the current progress and future plans. + +
+
+
+
+
+ + ♻ ☆ Split Learning without Local Weight Sharing to Enhance Client-side Data + Privacy + + +
+ Split learning (SL) aims to protect user data privacy by distributing deep
+ models between client and server and keeping private data locally. In SL
+ training with multiple clients, the local model weights are shared among the
+ clients for local model updates. This paper first reveals the data privacy
+ leakage exacerbated by local weight sharing among clients in SL through model
+ inversion attacks. Then, to reduce data privacy leakage, we propose and
+ analyze privacy-enhanced SL (P-SL), i.e., SL without local weight sharing. We
+ further propose parallelized P-SL to expedite the training process by
+ duplicating multiple server-side model instances without compromising
+ accuracy. Finally, we explore P-SL with late-participating clients and devise
+ a server-side cache-based training method to address the forgetting
+ phenomenon in SL when late clients join. Experimental results demonstrate
+ that P-SL helps reduce client-side data leakage by up to 50%, which
+ essentially achieves a better privacy-accuracy trade-off than the current
+ trend of using differential privacy mechanisms. Moreover, P-SL and its
+ cache-based version achieve comparable accuracy to baseline SL under various
+ data distributions while costing less in computation and communication.
+ Additionally, cache-based training in P-SL mitigates the negative effect of
+ forgetting, stabilizes the learning, and enables practical and low-complexity
+ training in a dynamic environment with late-arriving clients.
+
+
+
+
+
+
+ + ♻ ☆ BAFFLE: A Baseline of Backpropagation-Free Federated Learning + + +
+ Federated learning (FL) is a general principle for decentralized clients to +train a server model collectively without sharing local data. FL is a promising +framework with practical applications, but its standard training paradigm +requires the clients to backpropagate through the model to compute gradients. +Since these clients are typically edge devices and not fully trusted, executing +backpropagation on them incurs computational and storage overhead as well as +white-box vulnerability. In light of this, we develop backpropagation-free +federated learning, dubbed BAFFLE, in which backpropagation is replaced by +multiple forward processes to estimate gradients. BAFFLE is 1) memory-efficient +and easily fits uploading bandwidth; 2) compatible with inference-only hardware +optimization and model quantization or pruning; and 3) well-suited to trusted +execution environments, because the clients in BAFFLE only execute forward +propagation and return a set of scalars to the server. Empirically we use +BAFFLE to train deep models from scratch or to finetune pretrained models, +achieving acceptable results. Code is available in +https://github.com/FengHZ/BAFFLE. + +
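+ The core replacement of backpropagation can be sketched as a zeroth-order,
+ forward-only gradient estimate (a textbook finite-difference form; BAFFLE's
+ exact estimator and variance controls may differ):
+
+import torch
+
+def forward_only_grad(loss_fn, params, eps=1e-3, num_probes=16):
+    base = loss_fn(params)
+    grad = torch.zeros_like(params)
+    for _ in range(num_probes):
+        v = torch.randn_like(params)                      # random direction
+        delta = (loss_fn(params + eps * v) - base) / eps  # one scalar per probe
+        grad += delta * v            # clients only return such scalars
+    return grad / num_probes
+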
+
+ comment: ECCV 2024 +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ♻ ☆ On Quantum Pushdown Systems, Extensions + + +
+ In this paper, we first define the quantum analogues of probabilistic
+ pushdown systems and Markov chains, and investigate whether it is necessary
+ to define a quantum analogue of probabilistic computational tree logic to
+ describe the probabilistic and branching-time properties of the quantum
+ Markov chain defined here. We study its model-checking question and show that
+ model-checking of stateless quantum pushdown systems (qBPA) against
+ probabilistic computational tree logic (PCTL) is generally undecidable.
+ We next define the notion of a probabilistic $\omega$-pushdown automaton for
+ the first time and study the model-checking question of stateless
+ probabilistic $\omega$-pushdown systems ($\omega$-pBPA) against $\omega$-PCTL
+ (defined by Chatterjee et al. in [CSH08]), showing that this model-checking
+ problem is also generally undecidable. Our approach is to construct formulas
+ of $\omega$-PCTL encoding the Post Correspondence Problem indirectly.
+
+
+
+ comment: [v9] not finished; comments are welcome. arXiv admin note: + substantial text overlap with arXiv:1405.4806 +
+
+
+
+
+
+
+
+ + Logic in Computer Science 6 + +
+
+
+ + ☆ Error Detection and Constraint Recovery in Hierarchical Multi-Label + Classification without Prior Knowledge + + +
+ Recent advances in Hierarchical Multi-label Classification (HMC),
+ particularly neurosymbolic-based approaches, have demonstrated improved
+ consistency and accuracy by enforcing constraints on a neural model during
+ training. However, such work assumes that these constraints exist a priori.
+ In this paper, we relax this strong assumption and present an approach based
+ on Error Detection Rules (EDR) that allows for learning explainable rules
+ about the failure modes of machine learning models. We show that these rules
+ are not only effective in detecting when a machine learning classifier has
+ made an error but can also be leveraged as constraints for HMC, thereby
+ allowing the recovery of explainable constraints even when they are not
+ provided. We show that our approach is effective in detecting machine
+ learning errors and recovering constraints, is noise tolerant, and can
+ function as a source of knowledge for neurosymbolic models on multiple
+ datasets, including a newly introduced military vehicle recognition dataset.
+
+
+
+
+
+
+ + ☆ A Cobham theorem for scalar multiplication + + +
+ Let $\alpha,\beta \in \mathbb{R}_{>0}$ be such that $\alpha,\beta$ are +quadratic and $\mathbb{Q}(\alpha)\neq \mathbb{Q}(\beta)$. Then every subset of +$\mathbb{R}^n$ definable in both $(\mathbb{R},{<},+,\mathbb{Z},x\mapsto \alpha +x)$ and $(\mathbb{R},{<},+,\mathbb{Z},x\mapsto \beta x)$ is already definable +in $(\mathbb{R},{<},+,\mathbb{Z})$. As a consequence we generalize +Cobham-Semenov theorems for sets of real numbers to $\beta$-numeration systems, +where $\beta$ is a quadratic irrational. + +
+
+
+
+
+ + ♻ ☆ Tree algebras and bisimulation-invariant MSO on finite graphs + + +
+ We establish that the bisimulation-invariant fragment of MSO over finite
+ transition systems is expressively equivalent to the modal mu-calculus, a
+ question that had remained open for several decades. The proof goes by
+ translating the question into an algebraic framework and showing that the
+ languages of regular trees recognized by finitary tree algebras whose sorts
+ zero and one are finite are exactly the regular ones, i.e., the ones
+ expressible in the mu-calculus. For trees, this corresponds to a weak form of
+ the key translation of Wilke algebras to omega-semigroups over infinite
+ words, and it had also been a missing piece in the algebraic theory of
+ regular languages of infinite trees for twenty years.
+
+
+
+
+
+
+ + ♻ ☆ Compact Proofs of Model Performance via Mechanistic Interpretability + + +
+ We propose using mechanistic interpretability -- techniques for reverse +engineering model weights into human-interpretable algorithms -- to derive and +compactly prove formal guarantees on model performance. We prototype this +approach by formally proving lower bounds on the accuracy of 151 small +transformers trained on a Max-of-$K$ task. We create 102 different +computer-assisted proof strategies and assess their length and tightness of +bound on each of our models. Using quantitative metrics, we find that shorter +proofs seem to require and provide more mechanistic understanding. Moreover, we +find that more faithful mechanistic understanding leads to tighter performance +bounds. We confirm these connections by qualitatively examining a subset of our +proofs. Finally, we identify compounding structureless noise as a key challenge +for using mechanistic interpretability to generate compact proofs on model +performance. + +
+
+ comment: accepted to ICML 2024 Workshop on Mechanistic Interpretability + (Spotlight) +
+
+
+
+
+ + ♻ ☆ On Quantum Pushdown Systems, Extensions + + +
+ In this paper, we first define the quantum analogues of probabilistic
+ pushdown systems and Markov chains, and investigate whether it is necessary
+ to define a quantum analogue of probabilistic computational tree logic to
+ describe the probabilistic and branching-time properties of the quantum
+ Markov chain defined here. We study its model-checking question and show that
+ model-checking of stateless quantum pushdown systems (qBPA) against
+ probabilistic computational tree logic (PCTL) is generally undecidable.
+ We next define the notion of a probabilistic $\omega$-pushdown automaton for
+ the first time and study the model-checking question of stateless
+ probabilistic $\omega$-pushdown systems ($\omega$-pBPA) against $\omega$-PCTL
+ (defined by Chatterjee et al. in [CSH08]), showing that this model-checking
+ problem is also generally undecidable. Our approach is to construct formulas
+ of $\omega$-PCTL encoding the Post Correspondence Problem indirectly.
+
+
+
+ comment: [v9] not finished; comments are welcome. arXiv admin note: + substantial text overlap with arXiv:1405.4806 +
+
+
+
+
+ + ♻ ☆ Learn from Failure: Fine-Tuning LLMs with Trial-and-Error Data for + Intuitionistic Propositional Logic Proving + + +
+ Recent advances in Automated Theorem Proving have shown the effectiveness of +leveraging a (large) language model that generates tactics (i.e. proof steps) +to search through proof states. The current model, while trained solely on +successful proof paths, faces a discrepancy at the inference stage, as it must +sample and try various tactics at each proof state until finding success, +unlike its training which does not incorporate learning from failed attempts. +Intuitively, a tactic that leads to a failed search path would indicate that +similar tactics should receive less attention during the following trials. In +this paper, we demonstrate the benefit of training models that additionally +learn from failed search paths. Facing the lack of such trial-and-error data in +existing open-source theorem-proving datasets, we curate a dataset on +intuitionistic propositional logic theorems and formalize it in Lean, such that +we can reliably check the correctness of proofs. We compare our model trained +on relatively short trial-and-error information (TrialMaster) with models +trained only on the correct paths and discover that the former solves more +unseen theorems with lower trial searches. + +
+
+ comment: Accepted as a main conference paper at ACL 2024 +
+
+
+
+
+
+
+
+ + Hardware Architecture 3 + +
+
+
+ + ☆ Token-Picker: Accelerating Attention in Text Generation with Minimized + Memory Transfer via Probability Estimation + + +
+ The attention mechanism in text generation is memory-bound due to its
+ sequential characteristics. Therefore, off-chip memory accesses should be
+ minimized for faster execution. Although previous methods addressed this by
+ pruning unimportant tokens, they fall short of selectively removing tokens
+ with near-zero attention probabilities in each instance. Our method estimates
+ the probability before the softmax function, effectively removing
+ low-probability tokens and achieving a 12.1x pruning ratio without
+ fine-tuning. Additionally, we present a hardware design supporting seamless
+ on-demand off-chip access. Our approach reduces memory accesses by 2.6x,
+ leading to an average 2.3x speedup and a 2.4x improvement in energy
+ efficiency.
+
+
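+ The pruning criterion can be pictured as dropping keys whose score falls far
+ enough below the running maximum that their softmax mass is negligible
+ (sketch using exact scores; the paper estimates them cheaply before the
+ softmax, and the margin below is illustrative):
+
+import torch
+
+def pruned_attention(q, K, margin=10.0):
+    scores = (K @ q) / (q.shape[0] ** 0.5)  # pre-softmax logits, shape (n,)
+    keep = scores >= scores.max() - margin  # exp(-10) ~ 4.5e-5 of the max
+    probs = torch.zeros_like(scores)
+    probs[keep] = torch.softmax(scores[keep], dim=0)
+    return probs, keep  # fetch value rows only where keep is True
+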
+
+ comment: To appear in the proceedings of 61st Design Automation Conference + (DAC) +
+
+
+
+
+ + ☆ AutoVCoder: A Systematic Framework for Automated Verilog Code Generation + using LLMs + + +
+ Recently, the use of large language models (LLMs) for software code +generation, e.g., C/C++ and Python, has proven a great success. However, LLMs +still suffer from low syntactic and functional correctness when it comes to the +generation of register-transfer level (RTL) code, such as Verilog. To address +this issue, in this paper, we develop AutoVCoder, a systematic open-source +framework that significantly improves the LLMs' correctness of generating +Verilog code and enhances the quality of its output at the same time. Our +framework integrates three novel techniques, including a high-quality hardware +dataset generation approach, a two-round LLM fine-tuning method and a +domain-specific retrieval-augmented generation (RAG) mechanism. Experimental +results demonstrate that AutoVCoder outperforms both industrial and academic +LLMs in Verilog code generation. Specifically, AutoVCoder shows a 0.5% and 2.2% +improvement in functional correctness on the EvalMachine and EvalHuman +benchmarks compared with BetterV, and also achieves a 3.4% increase in syntax +correctness and a 3.4% increase in functional correctness on the RTLLM +benchmark compared with RTLCoder. + +
+
+
+
+
+ + ☆ Large Language Model for Verilog Generation with Golden Code Feedback + + +
+ Recent advancements in large language models (LLMs) have catalyzed
+ significant interest in the automatic generation of Register-Transfer Level
+ (RTL) code, particularly Verilog, from natural language instructions. While
+ commercial LLMs like ChatGPT have dominated this domain, open-source
+ alternatives have lagged considerably in performance, limiting the
+ flexibility and data privacy of this emerging technology. This study
+ introduces a novel approach utilizing reinforcement learning with golden code
+ feedback to enhance the performance of pre-trained models. Leveraging
+ open-source data and base models, we have achieved state-of-the-art (SOTA)
+ results with a substantial margin. Notably, our 6.7B-parameter model,
+ VeriSeek, demonstrates superior performance compared to current best-in-class
+ 13B and 16B models. Furthermore, through a comprehensive analysis of the
+ limitations of direct fine-tuning and the training dynamics of reinforcement
+ learning, we posit that the development of comprehensive supervisory signals,
+ which align with the inherent parallel semantics of Verilog code, is critical
+ to effective generation. The code and data associated with this research are
+ publicly available at https://github.com/CatIIIIIIII/veriseek. The model
+ weights can be accessed at https://huggingface.co/WANGNingroci/VeriSeek.
+
+
+
+
+
+
+
+
+
+ + Programming and Languages 2 + +
+
+
+ + ☆ SNIP: Speculative Execution and Non-Interference Preservation for + Compiler Transformations + + +
+ We address the problem of preserving non-interference across compiler +transformations under speculative semantics. We develop a proof method that +ensures the preservation uniformly across all source programs. The basis of our +proof method is a new form of simulation relation. It operates over directives +that model the attacker's control over the micro-architectural state, and it +accounts for the fact that the compiler transformation may change the influence +of the micro-architectural state on the execution (and hence the directives). +Using our proof method, we show the correctness of dead code elimination. When +we tried to prove register allocation correct, we identified a previously +unknown weakness that introduces violations to non-interference. We have +confirmed the weakness for a mainstream compiler on code from the libsodium +cryptographic library. To reclaim security once more, we develop a novel static +analysis that operates on a product of source program and register-allocated +program. Using the analysis, we present an automated fix to existing register +allocation implementations. We prove the correctness of the fixed register +allocations with our proof method. + +
+
+
+
+
+ + ♻ ☆ VerityMath: Advancing Mathematical Reasoning by Self-Verification + Through Unit Consistency + + +
+ Large Language Models (LLMs), combined with program-based solving techniques,
+ are increasingly demonstrating proficiency in mathematical reasoning. For
+ example, closed-source models such as OpenAI GPT-4 and Claude show excellent
+ results in solving math word problems. However, progress in math word problem
+ solving for open-source LLMs is limited, and the challenges these models face
+ are not well studied. In this paper, we study the performance of strong
+ open-source LLMs, including Llama 2 (7B), Code Llama (7B), and Mistral (7B),
+ on math word problems using program-based solving techniques. Specifically,
+ we analyze the outputs of these models when applied to math word problems and
+ identify a category of problems that pose a significant challenge,
+ particularly those involving quantities spanning multiple units. To address
+ this issue, we propose a systematic approach that defines the unit for each
+ quantity and ensures the consistency of these units during mathematical
+ operations. We developed Unit Consistency Programs (UCPs), an annotated
+ dataset of math word problems, each paired with programs containing unit
+ specifications and unit verification routines. We fine-tuned the Llama 2
+ (7B), Code Llama (7B), and Mistral (7B) models with UCPs to produce their
+ VerityMath variants. Our findings indicate that our approach, which
+ incorporates unit consistency, currently performs slightly worse than an
+ approach that does not. To understand the reasons behind this, we conduct an
+ in-depth error analysis and suggest options for future improvements. Our code
+ and dataset are available at https://github.com/vernontoh/VerityMath.
+
+
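+ A toy of what a unit-annotated solution with a verification step might look
+ like (our own illustration; the UCP dataset defines its own annotation and
+ verification conventions):
+
+def convert(quantity, factor, new_unit):
+    value, _ = quantity  # quantities carried as (value, unit) pairs
+    return (value * factor, new_unit)
+
+def solve():
+    rate = (12.0, "dollar/hour")
+    worked = (90.0, "minute")
+    worked = convert(worked, 1 / 60, "hour")  # minute -> hour
+    # unit verification: dollar/hour * hour = dollar
+    assert rate[1] == "dollar/hour" and worked[1] == "hour"
+    return (rate[0] * worked[0], "dollar")
+
+print(solve())  # (18.0, 'dollar')
+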
+
+ comment: AI4MATH Workshop @ ICML 2024 +
+
+
+
+
+
+
+
+ + Computational Complexity 1 + +
+
+
+ + ☆ The Complexity of (P3, H)-Arrowing and Beyond + + +
+ Often regarded as the study of how order emerges from randomness, Ramsey +theory has played an important role in mathematics and computer science, giving +rise to applications in numerous domains such as logic, parallel processing, +and number theory. The core of graph Ramsey theory is arrowing: For fixed +graphs $F$ and $H$, the $(F, H)$-Arrowing problem asks whether a given graph, +$G$, has a red/blue coloring of the edges of $G$ such that there are no red +copies of $F$ and no blue copies of $H$. For some cases, the problem has been +shown to be coNP-complete, or solvable in polynomial time. However, a more +systematic approach is needed to categorize the complexity of all cases. + We focus on $(P_3, H)$-Arrowing as $F = P_3$ is the simplest meaningful case +for which the complexity question remains open, and the hardness for this case +likely extends to general $(F, H)$-Arrowing for nontrivial $F$. In this +pursuit, we also gain insight into the complexity of a class of matching +removal problems, since $(P_3, H)$-Arrowing is equivalent to $H$-free Matching +Removal. We show that $(P_3, H)$-Arrowing is coNP-complete for all +$2$-connected $H$ except when $H = K_3$, in which case the problem is in P. We +introduce a new graph invariant to help us carefully combine graphs when +constructing the gadgets for our reductions. Moreover, we show how +$(P_3,H)$-Arrowing hardness results can be extended to other $(F,H)$-Arrowing +problems. This allows for more intuitive and palatable hardness proofs instead +of ad-hoc constructions of SAT gadgets, bringing us closer to categorizing the +complexity of all $(F, H)$-Arrowing problems. + +
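+ For very small instances, arrowing can be checked by brute force directly
+ from the definition; the sketch below tests $G \rightarrow (P_3, K_3)$, the
+ polynomial case mentioned above, and is exponential in the number of edges
+ (intuition only, with edges given as sorted vertex pairs):
+
+from itertools import combinations, product
+
+def has_red_p3(red):    # two red edges sharing exactly one endpoint
+    return any(len(set(e) & set(f)) == 1 for e, f in combinations(red, 2))
+
+def has_blue_k3(blue):
+    bs, vs = set(blue), sorted({v for e in blue for v in e})
+    return any((a, b) in bs and (a, c) in bs and (b, c) in bs
+               for a, b, c in combinations(vs, 3))
+
+def arrows_p3_k3(edges):
+    for colors in product("rb", repeat=len(edges)):
+        red = [e for e, c in zip(edges, colors) if c == "r"]
+        blue = [e for e, c in zip(edges, colors) if c == "b"]
+        if not has_red_p3(red) and not has_blue_k3(blue):
+            return False  # a "good" coloring exists, so G does not arrow
+    return True
+
+print(arrows_p3_k3(list(combinations(range(5), 2))))  # True for K5
+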
+
+ comment: To appear in MFCS 2024 +
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all entries with the TAB key */
+let expanded = false;
+document.onkeydown = function (e) {
+    if (e.key === "Tab") {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // suppress the default focus-change behavior of TAB
+    }
+};
+
+/* Switch between light and dark themes */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+/* Restore the persisted theme on load; getItem() already returns null when unset */
+const currentTheme = localStorage.getItem('theme');
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+/* Render the build timestamp in the visitor's local time zone */
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`