From 1bae8f9f913ceb8ea509470cd44d099225094c4e Mon Sep 17 00:00:00 2001 From: Gaetan-007 Date: Sun, 26 May 2024 12:10:57 +0000 Subject: [PATCH] deploy: e2cacd40f873c7ef1fd8689bef437e751a8b066f --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 ++ index.html | 10470 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 10865 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..471a98e5 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-05-18T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2405.09398v2","updated":"2024-05-18T07:00:49Z","published":"2024-05-15T14:51:46Z","title":"Encrypted Container File: Design and Implementation of a\n Hybrid-Encrypted Multi-Recipient File Structure","summary":" Modern software engineering trends towards Cloud-native software development\nby international teams of developers. Cloud-based version management services,\nsuch as GitHub, are used for the source code and other artifacts created during\nthe development process. However, using such a service usually means that every\ndeveloper has access to all data stored on the platform. Particularly, if the\ndevelopers belong to different companies or organizations, it would be\ndesirable for sensitive files to be encrypted in such a way that these can only\nbe decrypted again by a group of previously defined people. In this paper, we\nexamine currently available tools that address this problem, but which have\ncertain shortcomings. We then present our own solution, Encrypted Container\nFiles (ECF), for this problem, eliminating the deficiencies found in the other\ntools.\n","authors":["Tobias J. 
Bauer","Andreas Aßmuth"],"pdf_url":"https://arxiv.org/pdf/2405.09398v2.pdf","comment":"7 pages, for associated implementation etc., see\n https://github.com/Hirnmoder/ECF"},{"id":"http://arxiv.org/abs/2304.05397v2","updated":"2024-05-18T23:52:22Z","published":"2023-04-10T19:13:14Z","title":"Accelerating Hybrid Federated Learning Convergence under Partial\n Participation","summary":" Over the past few years, Federated Learning (FL) has become a popular\ndistributed machine learning paradigm. FL involves a group of clients with\ndecentralized data who collaborate to learn a common model under the\ncoordination of a centralized server, with the goal of protecting clients'\nprivacy by ensuring that local datasets never leave the clients and that the\nserver only performs model aggregation. However, in realistic scenarios, the\nserver may be able to collect a small amount of data that approximately mimics\nthe population distribution and has stronger computational ability to perform\nthe learning process. To address this, we focus on the hybrid FL framework in\nthis paper. While previous hybrid FL work has shown that the alternative\ntraining of clients and server can increase convergence speed, it has focused\non the scenario where clients fully participate and ignores the negative effect\nof partial participation. In this paper, we provide theoretical analysis of\nhybrid FL under clients' partial participation to validate that partial\nparticipation is the key constraint on convergence speed. We then propose a new\nalgorithm called FedCLG, which investigates the two-fold role of the server in\nhybrid FL. Firstly, the server needs to process the training steps using its\nsmall amount of local datasets. 
Secondly, the server's calculated gradient\nneeds to guide the participated clients' training and the server's aggregation.\nWe validate our theoretical findings through numerical experiments, which show\nthat our proposed method FedCLG outperforms state-of-the-art methods.\n","authors":["Jieming Bian","Lei Wang","Kun Yang","Cong Shen","Jie Xu"],"pdf_url":"https://arxiv.org/pdf/2304.05397v2.pdf","comment":"Accepted by IEEE Transactions on Signal Processing, Update the\n convergence analysis and add more experiment results"},{"id":"http://arxiv.org/abs/2405.11368v1","updated":"2024-05-18T18:33:04Z","published":"2024-05-18T18:33:04Z","title":"Security of Cloud Services with Low-Performance Devices in Critical\n Infrastructures","summary":" As part of the Internet of Things (IoT) and Industry 4.0 Cloud services are\nincreasingly interacting with low-performance devices that are used in\nautomation. This results in security issues that will be presented in this\npaper. Particular attention is paid to so-called critical infrastructures. The\nauthors intend to work on the addressed security challenges as part of a funded\nresearch project, using electrical actuators and battery storages as specific\napplications. The core ideas of this research project are also presented in\nthis paper.\n","authors":["Michael Molle","Ulrich Raithel","Dirk Kraemer","Norbert Graß","Matthias Söllner","Andreas Aßmuth"],"pdf_url":"https://arxiv.org/pdf/2405.11368v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2405.11350v1","updated":"2024-05-18T17:42:02Z","published":"2024-05-18T17:42:02Z","title":"Cloud Security and Security Challenges Revisited","summary":" In recent years, Cloud Computing has transformed local businesses and created\nnew business models on the Internet- and Cloud services are still flourishing.\nBut after the emphatic hype in the early years, a more realistic perception of\nCloud services has emerged. 
One reason for this surely is that today, Cloud\nComputing is considered as an established and well-accepted technology and no\nlonger as a technical novelty. But the second reason for this assessment might\nalso be numerous security issues that Cloud Computing in general or specific\nCloud services have experienced since then. In this paper, we revisit attacks\non Cloud services and Cloud-related attack vectors that have been published in\nrecent years. We then consider successful or proposed solutions to cope with\nthese challenges. Based on these findings, we apply a security metric in order\nto rank all these Cloud-related security challenges concerning their severity.\nThis should assist security professionals to prioritize their efforts toward\naddressing these issues.\n","authors":["Fabian Süß","Marco Freimuth","Andreas Aßmuth","George R. S. Weir","Bob Duncan"],"pdf_url":"https://arxiv.org/pdf/2405.11350v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2405.11341v1","updated":"2024-05-18T17:10:48Z","published":"2024-05-18T17:10:48Z","title":"A Secure and Privacy-Friendly Logging Scheme","summary":" Finding a robust security mechanism for audit trail logging has long been a\npoorly satisfied goal. There are many reasons for this. The most significant of\nthese is that the audit trail is a highly sought after goal of attackers to\nensure that they do not get caught. Thus they have an incredibly strong\nincentive to prevent companies from succeeding in this worthy aim. Regulation,\nsuch as the European Union General Data Protection Regulation, has brought a\nstrong incentive for companies to achieve success in this area due to the\npunitive level of fines that can now be levied in the event of a successful\nbreach by an attacker. 
We seek to resolve this issue through the use of an\nencrypted audit trail process that saves encrypted records to a true immutable\ndatabase, which can ensure audit trail records are permanently retained in\nencrypted form, with no possibility of the records being compromised. This\nensures compliance with the General Data Protection Regulation can be achieved.\n","authors":["Andreas Aßmuth","Robert Duncan","Simon Liebl","Matthias Söllner"],"pdf_url":"https://arxiv.org/pdf/2405.11341v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2405.11316v1","updated":"2024-05-18T15:26:38Z","published":"2024-05-18T15:26:38Z","title":"Securing 3rd Party App Integration in Docker-based Cloud Software\n Ecosystems","summary":" Open software ecosystems are beneficial for customers; they benefit from 3rd\nparty services and applications, e.g. analysis of data using apps, developed\nand deployed by other companies or open-source communities. One significant\nadvantage of this approach is that other customers may benefit from these newly\ndeveloped applications as well. Especially software ecosystems utilizing\ncontainer technologies are prone to certain risks. Docker, in particular, is\nmore vulnerable to attacks than hypervisor based virtualisation as it directly\noperates on the host system. Docker is a popular representative of\ncontainerisation technology which offers a lightweight architecture in order to\nfacilitate the set-up and creation of such software ecosystems. Popular\nInfrastructure as a Service cloud service providers, like Amazon Web Services\nor Microsoft Azure, jump on the containerisation bandwagon and provide\ninterfaces for provisioning and managing containers. Companies can benefit from\nthat change of technology and create software ecosystems more efficiently. 
In\nthis paper, we present a new concept for significant security improvements for\ncloud-based software ecosystems using Docker for 3rd party app integration.\nBased on the security features of Docker we describe a secure integration of\napplications in the cloud environment securely. Our approach considers the\nwhole software lifecycle and includes sandbox testing of potentially dangerous\n3rd party apps before these became available to the customers.\n","authors":["Christian Binkowski","Stefan Appel","Andreas Aßmuth"],"pdf_url":"https://arxiv.org/pdf/2405.11316v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2405.11281v1","updated":"2024-05-18T12:45:00Z","published":"2024-05-18T12:45:00Z","title":"Cooperative Cognitive Dynamic System in UAV Swarms: Reconfigurable\n Mechanism and Framework","summary":" As the demands for immediate and effective responses increase in both\ncivilian and military domains, the unmanned aerial vehicle (UAV) swarms emerge\nas effective solutions, in which multiple cooperative UAVs can work together to\nachieve specific goals. However, how to manage such complex systems to ensure\nreal-time adaptability lack sufficient researches. Hence, in this paper, we\npropose the cooperative cognitive dynamic system (CCDS), to optimize the\nmanagement for UAV swarms. CCDS leverages a hierarchical and cooperative\ncontrol structure that enables real-time data processing and decision.\nAccordingly, CCDS optimizes the UAV swarm management via dynamic\nreconfigurability and adaptive intelligent optimization. In addition, CCDS can\nbe integrated with the biomimetic mechanism to efficiently allocate tasks for\nUAV swarms. Further, the distributed coordination of CCDS ensures reliable and\nresilient control, thus enhancing the adaptability and robustness. 
Finally, the\npotential challenges and future directions are analyzed, to provide insights\ninto managing UAV swarms in dynamic heterogeneous networking.\n","authors":["Ziye Jia","Jiahao You","Chao Dong","Qihui Wu","Fuhui Zhou","Dusit Niyato","Zhu Han"],"pdf_url":"https://arxiv.org/pdf/2405.11281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11183v1","updated":"2024-05-18T05:31:48Z","published":"2024-05-18T05:31:48Z","title":"MultiPaxos Made Complete","summary":" MultiPaxos, while a fundamental Replicated State Machine algorithm, suffers\nfrom a dearth of comprehensive guidelines for achieving a complete and correct\nimplementation. This deficiency has hindered MultiPaxos' practical utility and\nadoption and has resulted in flawed claims about its capabilities. Our paper\naims to bridge the gap between MultiPaxos' complexity and practical\nimplementation through a meticulous and detailed design process spanning more\nthan a year. It carefully dissects each phase of MultiPaxos and offers detailed\nstep-by-step pseudocode -- in addition to a complete open-source implementation\n-- for all components, including the leader election, the failure detector, and\nthe commit phase.\n The implementation of our complete design also provides better performance\nstability, resource usage, and network partition tolerance than naive\nMultiPaxos versions. Our specification includes a lightweight log compaction\napproach that avoids taking repeated snapshots, significantly improving\nresource usage and performance stability. 
Our failure detector, integrated into\nthe commit phase of the algorithm, uses variable and adaptive heartbeat\nintervals to settle on a better leader under partial connectivity and network\npartitions, improving liveness under such conditions.\n","authors":["Zhiying Liang","Vahab Jabrayilov","Aleksey Charapko","Abutalib Aghayev"],"pdf_url":"https://arxiv.org/pdf/2405.11183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11182v1","updated":"2024-05-18T05:29:22Z","published":"2024-05-18T05:29:22Z","title":"The Cost of Garbage Collection for State Machine Replication","summary":" State Machine Replication (SMR) protocols form the backbone of many\ndistributed systems. Enterprises and startups increasingly build their\ndistributed systems on the cloud due to its many advantages, such as\nscalability and cost-effectiveness. One of the first technical questions\ncompanies face when building a system on the cloud is which programming\nlanguage to use. Among many factors that go into this decision is whether to\nuse a language with garbage collection (GC), such as Java or Go, or a language\nwith manual memory management, such as C++ or Rust. Today, companies\npredominantly prefer languages with GC, like Go, Kotlin, or even Python, due to\nease of development; however, there is no free lunch: GC costs resources\n(memory and CPU) and performance (long tail latencies due to GC pauses). While\nthere have been anecdotal reports of reduced cloud cost and improved tail\nlatencies when switching from a language with GC to a language with manual\nmemory management, so far, there has not been a systematic study of the GC\noverhead of running an SMR-based cloud system.\n This paper studies the overhead of running an SMR-based cloud system written\nin a language with GC. To this end, we design from scratch a canonical SMR\nsystem -- a MultiPaxos-based replicated in-memory key-value store -- and we\nimplement it in C++, Java, Rust, and Go. 
We compare the performance and\nresource usage of these implementations when running on the cloud under\ndifferent workloads and resource constraints and report our results. Our\nfindings have implications for the design of cloud systems.\n","authors":["Zhiying Liang","Vahab Jabrayilov","Aleksey Charapko","Abutalib Aghayev"],"pdf_url":"https://arxiv.org/pdf/2405.11182v1.pdf","comment":"17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2405.13043v1","updated":"2024-05-18T08:03:31Z","published":"2024-05-18T08:03:31Z","title":"Towards Specialized Supercomputers for Climate Sciences: Computational\n Requirements of the Icosahedral Nonhydrostatic Weather and Climate Model","summary":" We discuss the computational challenges and requirements for high-resolution\nclimate simulations using the Icosahedral Nonhydrostatic Weather and Climate\nModel (ICON). We define a detailed requirements model for ICON which emphasizes\nthe need for specialized supercomputers to accurately predict climate change\nimpacts and extreme weather events. Based on the requirements model, we outline\ncomputational demands for km-scale simulations, and suggests machine learning\ntechniques to enhance model accuracy and efficiency. Our findings aim to guide\nthe design of future supercomputers for advanced climate science.\n","authors":["Torsten Hoefler","Alexandru Calotoiu","Anurag Dipankar","Thomas Schulthess","Xavier Lapillonne","Oliver Fuhrer"],"pdf_url":"https://arxiv.org/pdf/2405.13043v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2405.08051v2","updated":"2024-05-18T16:46:12Z","published":"2024-05-13T16:35:14Z","title":"P=NP","summary":" This paper investigates an extremely classic NP-complete problem: How to\ndetermine if a graph G, where each vertex has a degree of at most 4, can be\n3-colorable(The research in this paper focuses on graphs G that satisfy the\ncondition where the degree of each vertex does not exceed 4. 
To conserve space,\nit is assumed throughout the paper that graph G meets this condition by\ndefault.). The author has meticulously observed the relationship between the\ncoloring problem and semidefinite programming, and has creatively constructed\nthe corresponding semidefinite programming problem R(G) for a given graph G.\nThe construction method of R(G) refers to Theorem 1.1 in the paper. I have\nobtained and proven the conclusion: A graph G is 3-colorable if and only if the\nobjective function of its corresponding optimization problem R(G) is bounded,\nand when the objective function is bounded, its minimum value is 0.\n","authors":["Zikang Deng"],"pdf_url":"https://arxiv.org/pdf/2405.08051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03069v2","updated":"2024-05-18T20:14:42Z","published":"2024-05-05T22:32:01Z","title":"On Probabilistic and Causal Reasoning with Summation Operators","summary":" Ibeling et al. (2023). axiomatize increasingly expressive languages of\ncausation and probability, and Mosse et al. (2024) show that reasoning\n(specifically the satisfiability problem) in each causal language is as\ndifficult, from a computational complexity perspective, as reasoning in its\nmerely probabilistic or \"correlational\" counterpart. Introducing a summation\noperator to capture common devices that appear in applications -- such as the\n$do$-calculus of Pearl (2009) for causal inference, which makes ample use of\nmarginalization -- van der Zander et al. (2023) partially extend these earlier\ncomplexity results to causal and probabilistic languages with marginalization.\nWe complete this extension, fully characterizing the complexity of\nprobabilistic and causal reasoning with summation, demonstrating that these\nagain remain equally difficult. Surprisingly, allowing free variables for\nrandom variable values results in a system that is undecidable, so long as the\nranges of these random variables are unrestricted. 
We finally axiomatize these\nlanguages featuring marginalization (or more generally summation), resolving\nopen questions posed by Ibeling et al. (2023).\n","authors":["Duligur Ibeling","Thomas F. Icard","Milan Mossé"],"pdf_url":"https://arxiv.org/pdf/2405.03069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03343v3","updated":"2024-05-18T18:47:22Z","published":"2022-10-07T06:14:11Z","title":"On the complexity of symmetric vs. functional PCSPs","summary":" The complexity of the promise constraint satisfaction problem\n$\\operatorname{PCSP}(\\mathbf{A},\\mathbf{B})$ is largely unknown, even for\nsymmetric $\\mathbf{A}$ and $\\mathbf{B}$, except for the case when $\\mathbf{A}$\nand $\\mathbf{B}$ are Boolean.\n First, we establish a dichotomy for\n$\\operatorname{PCSP}(\\mathbf{A},\\mathbf{B})$ where $\\mathbf{A}, \\mathbf{B}$ are\nsymmetric, $\\mathbf{B}$ is functional (i.e. any $r-1$ elements of an $r$-ary\ntuple uniquely determines the last one), and $(\\mathbf{A},\\mathbf{B})$\nsatisfies technical conditions we introduce called dependency and additivity.\nThis result implies a dichotomy for\n$\\operatorname{PCSP}(\\mathbf{A},\\mathbf{B})$ with $\\mathbf{A},\\mathbf{B}$\nsymmetric and $\\mathbf{B}$ functional if (i) $\\mathbf{A}$ is Boolean, or (ii)\n$\\mathbf{A}$ is a hypergraph of a small uniformity, or (iii) $\\mathbf{A}$ has a\nrelation $R^{\\mathbf{A}}$ of arity at least 3 such that the hypergraph diameter\nof $(A, R^{\\mathbf{A}})$ is at most 1.\n Second, we show that for $\\operatorname{PCSP}(\\mathbf{A},\\mathbf{B})$, where\n$\\mathbf{A}$ and $\\mathbf{B}$ contain a single relation, $\\mathbf{A}$ satisfies\na technical condition called balancedness, and $\\mathbf{B}$ is arbitrary, the\ncombined basic linear programming relaxation (BLP) and the affine integer\nprogramming relaxation (AIP) is no more powerful than the (in general strictly\nweaker) AIP relaxation. 
Balanced $\\mathbf{A}$ include symmetric $\\mathbf{A}$\nor, more generally, $\\mathbf{A}$ preserved by a transitive permutation group.\n","authors":["Tamio-Vesa Nakajima","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2210.03343v3.pdf","comment":"Full version (with stronger results) of a LICS'23 paper"},{"id":"http://arxiv.org/abs/2405.11155v1","updated":"2024-05-18T02:59:32Z","published":"2024-05-18T02:59:32Z","title":"Inner-approximate Reachability Computation via Zonotopic Boundary\n Analysis","summary":" Inner-approximate reachability analysis involves calculating subsets of\nreachable sets, known as inner-approximations. This analysis is crucial in the\nfields of dynamic systems analysis and control theory as it provides a reliable\nestimation of the set of states that a system can reach from given initial\nstates at a specific time instant. In this paper, we study the\ninner-approximate reachability analysis problem based on the set-boundary\nreachability method for systems modelled by ordinary differential equations, in\nwhich the computed inner-approximations are represented with zonotopes. The\nset-boundary reachability method computes an inner-approximation by excluding\nstates reached from the initial set's boundary. The effectiveness of this\nmethod is highly dependent on the efficient extraction of the exact boundary of\nthe initial set. To address this, we propose methods leveraging boundary and\ntiling matrices that can efficiently extract and refine the exact boundary of\nthe initial set represented by zonotopes. Additionally, we enhance the\nexclusion strategy by contracting the outer-approximations in a flexible way,\nwhich allows for the computation of less conservative inner-approximations. To\nevaluate the proposed method, we compare it with state-of-the-art methods\nagainst a series of benchmarks. 
The numerical results demonstrate that our\nmethod is not only efficient but also accurate in computing\ninner-approximations.\n","authors":["Dejin Ren","Zhen Liang","Chenyu Wu","Jianqiang Ding","Taoran Wu","Bai Xue"],"pdf_url":"https://arxiv.org/pdf/2405.11155v1.pdf","comment":"the full version of the paper accepted by CAV 2024"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2405.03069v2","updated":"2024-05-18T20:14:42Z","published":"2024-05-05T22:32:01Z","title":"On Probabilistic and Causal Reasoning with Summation Operators","summary":" Ibeling et al. (2023). axiomatize increasingly expressive languages of\ncausation and probability, and Mosse et al. (2024) show that reasoning\n(specifically the satisfiability problem) in each causal language is as\ndifficult, from a computational complexity perspective, as reasoning in its\nmerely probabilistic or \"correlational\" counterpart. Introducing a summation\noperator to capture common devices that appear in applications -- such as the\n$do$-calculus of Pearl (2009) for causal inference, which makes ample use of\nmarginalization -- van der Zander et al. (2023) partially extend these earlier\ncomplexity results to causal and probabilistic languages with marginalization.\nWe complete this extension, fully characterizing the complexity of\nprobabilistic and causal reasoning with summation, demonstrating that these\nagain remain equally difficult. Surprisingly, allowing free variables for\nrandom variable values results in a system that is undecidable, so long as the\nranges of these random variables are unrestricted. We finally axiomatize these\nlanguages featuring marginalization (or more generally summation), resolving\nopen questions posed by Ibeling et al. (2023).\n","authors":["Duligur Ibeling","Thomas F. 
Icard","Milan Mossé"],"pdf_url":"https://arxiv.org/pdf/2405.03069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18995v2","updated":"2024-05-18T20:10:48Z","published":"2024-03-27T20:28:30Z","title":"Algebraic Reasoning Meets Automata in Solving Linear Integer Arithmetic\n (Technical Report)","summary":" We present a new angle on solving quantified linear integer arithmetic based\non combining the automata-based approach, where numbers are understood as\nbitvectors, with ideas from (nowadays prevalent) algebraic approaches, which\nwork directly with numbers. This combination is enabled by a fine-grained\nversion of the duality between automata and arithmetic formulae. In particular,\nwe employ a construction where states of automaton are obtained as derivatives\nof arithmetic formulae: then every state corresponds to a formula.\nOptimizations based on techniques and ideas transferred from the world of\nalgebraic methods are used on thousands of automata states, which dramatically\namplifies their effect. The merit of this combination of automata with\nalgebraic methods is demonstrated by our prototype implementation being\ncompetitive to and even superior to state-of-the-art SMT solvers.\n","authors":["Peter Habermehl","Vojtěch Havlena","Michal Hečko","Lukáš Holík","Ondřej Lengál"],"pdf_url":"https://arxiv.org/pdf/2403.18995v2.pdf","comment":"Accepted to CAV'24"},{"id":"http://arxiv.org/abs/2402.01982v2","updated":"2024-05-18T19:01:39Z","published":"2024-02-03T01:40:13Z","title":"A Proof-theoretic Semantics for Intuitionistic Linear Logic","summary":" The approach taken by Gheorghiu, Gu and Pym in their paper on giving a\nBase-extension Semantics for Intuitionistic Multiplicative Linear Logic is an\ninteresting adaptation of the work of Sandqvist for IPL to the substructural\nsetting. What is particularly interesting is how naturally the move to the\nsubstructural setting provided a semantics for the multiplicative fragment of\nintuitionistic linear logic. 
Whilst ultimately the Gheorghiu, Gu and Pym used\ntheir foundations to provide a semantics for bunched implication logic, it begs\nthe question, what of the rest of intuitionistic linear logic? In this paper, I\npresent just such a semantics. This is particularly of interest as this logic\nhas as a connective the bang, a modal connective. Capturing the inferentialist\ncontent of formulas marked with this connective is particularly challenging and\na discussion is dedicated to this at the end of the paper.\n","authors":["Yll Buzoku"],"pdf_url":"https://arxiv.org/pdf/2402.01982v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2405.11327v1","updated":"2024-05-18T16:07:49Z","published":"2024-05-18T16:07:49Z","title":"SMT-based Symbolic Model-Checking for Operator Precedence Languages","summary":" Operator Precedence Languages (OPL) have been recently identified as a\nsuitable formalism for model checking recursive procedural programs, thanks to\ntheir ability of modeling the program stack. OPL requirements can be expressed\nin the Precedence Oriented Temporal Logic (POTL), which features modalities to\nreason on the natural matching between function calls and returns, exceptions,\nand other advanced programming constructs that previous approaches, such as\nVisibly Pushdown Languages, cannot model effectively. Existing approaches for\nmodel checking of POTL have been designed following the explicit-state,\nautomata-based approach, a feature that severely limits their scalability. In\nthis paper, we give the first symbolic, SMT-based approach for model checking\nPOTL properties. While previous approaches construct the automaton for both the\nPOTL formula and the model of the program, we encode them into a (sequence of)\nSMT formulas. The search of a trace of the model witnessing a violation of the\nformula is then carried out by an SMT-solver, in a Bounded Model Checking\nfashion. 
We carried out an experimental evaluation, which shows the\neffectiveness of the proposed solution.\n","authors":["Michele Chiari","Luca Geatti","Nicola Gigante","Matteo Pradella"],"pdf_url":"https://arxiv.org/pdf/2405.11327v1.pdf","comment":"30 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.11267v1","updated":"2024-05-18T11:34:05Z","published":"2024-05-18T11:34:05Z","title":"Concurrent Games over Relational Structures: The Origin of Game Comonads","summary":" Spoiler-Duplicator games are used in finite model theory to examine the\nexpressive power of logics. Their strategies have recently been reformulated as\ncoKleisli maps of game comonads over relational structures, providing new\nresults in finite model theory via categorical techniques. We present a novel\nframework for studying Spoiler-Duplicator games by viewing them as event\nstructures. We introduce a first systematic method for constructing comonads\nfor all one-sided Spoiler-Duplicator games: game comonads are now realised by\nadjunctions to a category of games, generically constructed from a comonad in a\nbicategory of game schema (called signature games). Maps of the constructed\ncategories of games are strategies and generalise coKleisli maps of game\ncomonads; in the case of one-sided games they are shown to coincide with\nsuitably generalised homomorphisms. 
Finally, we provide characterisations of\nstrategies on two-sided Spoiler-Duplicator games; in a common special case they\ncoincide with spans of event structures.\n","authors":["Yoàv Montacute","Glynn Winskel"],"pdf_url":"https://arxiv.org/pdf/2405.11267v1.pdf","comment":"Extended version of the paper in Logic in Computer Science (LICS)\n 2024 Proceedings"},{"id":"http://arxiv.org/abs/2405.11308v1","updated":"2024-05-18T14:52:22Z","published":"2024-05-18T14:52:22Z","title":"Propositional dynamic logic and asynchronous cascade decompositions for\n regular trace languages","summary":" We propose a local, past-oriented fragment of propositional dynamic logic to\nreason about concurrent scenarios modelled as Mazurkiewicz traces, and prove it\nto be expressively complete with respect to regular trace languages. Because of\nlocality, specifications in this logic are efficiently translated into\nasynchronous automata, in a way that reflects the structure of formulas. In\nparticular, we obtain a new proof of Zielonka's fundamental theorem and we\nprove that any regular trace language can be implemented by a cascade product\nof localized asynchronous automata, which essentially operate on a single\nprocess.\n These results refine earlier results by Adsul et al. which involved a larger\nfragment of past propositional dynamic logic and used Mukund and Sohoni's\ngossip automaton. Our new results avoid using this automaton, or Zielonka's\ntimestamping mechanism and, in particular, they show how to implement a gossip\nautomaton as a cascade product.\n","authors":["Bharat Adsul","Paul Gastin","Shantanu Kulkarni","Pascal Weil"],"pdf_url":"https://arxiv.org/pdf/2405.11308v1.pdf","comment":"13 pages. 
Accepted for publication at LICS 2024"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2405.11353v1","updated":"2024-05-18T17:44:17Z","published":"2024-05-18T17:44:17Z","title":"NTTSuite: Number Theoretic Transform Benchmarks for Accelerating\n Encrypted Computation","summary":" Privacy concerns have thrust privacy-preserving computation into the\nspotlight. Homomorphic encryption (HE) is a cryptographic system that enables\ncomputation to occur directly on encrypted data, providing users with strong\nprivacy (and security) guarantees while using the same services they enjoy\ntoday unprotected. While promising, HE has seen little adoption due to\nextremely high computational overheads, rendering it impractical. Homomorphic\nencryption (HE) is a cryptographic system that enables computation to occur\ndirectly on encrypted data. In this paper we develop a benchmark suite, named\nNTTSuite, to enable researchers to better address these overheads by studying\nthe primary source of HE's slowdown: the number theoretic transform (NTT).\nNTTSuite constitutes seven unique NTT algorithms with support for CPUs (C++),\nGPUs (CUDA), and custom hardware (Catapult HLS).In addition, we propose\noptimizations to improve the performance of NTT running on FPGAs. We find our\nimplementation outperforms the state-of-the-art by 30%.\n","authors":["Juran Ding","Yuanzhe Liu","Lingbin Sun","Brandon Reagen"],"pdf_url":"https://arxiv.org/pdf/2405.11353v1.pdf","comment":"8 pages, 5 figures, and two tables. 
To download the source code, see\n https://github.com/Dragon201701/NTTSuite"},{"id":"http://arxiv.org/abs/2405.13043v1","updated":"2024-05-18T08:03:31Z","published":"2024-05-18T08:03:31Z","title":"Towards Specialized Supercomputers for Climate Sciences: Computational\n Requirements of the Icosahedral Nonhydrostatic Weather and Climate Model","summary":" We discuss the computational challenges and requirements for high-resolution\nclimate simulations using the Icosahedral Nonhydrostatic Weather and Climate\nModel (ICON). We define a detailed requirements model for ICON which emphasizes\nthe need for specialized supercomputers to accurately predict climate change\nimpacts and extreme weather events. Based on the requirements model, we outline\ncomputational demands for km-scale simulations, and suggests machine learning\ntechniques to enhance model accuracy and efficiency. Our findings aim to guide\nthe design of future supercomputers for advanced climate science.\n","authors":["Torsten Hoefler","Alexandru Calotoiu","Anurag Dipankar","Thomas Schulthess","Xavier Lapillonne","Oliver Fuhrer"],"pdf_url":"https://arxiv.org/pdf/2405.13043v1.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2405.11361v1","updated":"2024-05-18T18:13:31Z","published":"2024-05-18T18:13:31Z","title":"An Opportunistically Parallel Lambda Calculus for Performant Composition\n of Large Language Models","summary":" Large language models (LLMs) have shown impressive results at a wide-range of\ntasks. However, they have limitations, such as hallucinating facts and\nstruggling with arithmetic. Recent work has addressed these issues with\nsophisticated decoding techniques. 
However, performant decoding, particularly\nfor sophisticated techniques, relies crucially on parallelization and batching,\nwhich are difficult for developers.\n We make two observations: 1) existing approaches are high-level\ndomain-specific languages for gluing expensive black-box calls, but are not\ngeneral or compositional; 2) LLM programs are essentially pure (all effects\ncommute). Guided by these observations, we develop a novel, general-purpose\nlambda calculus for automatically parallelizing a wide-range of LLM\ninteractions, without user intervention. The key difference versus standard\nlambda calculus is a novel \"opportunistic\" evaluation strategy, which steps\nindependent parts of a program in parallel, dispatching black-box external\ncalls as eagerly as possible, even while data-independent parts of the program\nare waiting for their own external calls to return. To maintain the simplicity\nof the language and to ensure uniformity of opportunistic evaluation,\ncontrol-flow and looping constructs are implemented in-language, via Church\nencodings.\n We implement this approach in a framework called EPIC, embedded in--and\ninteroperating closely with--Python. We demonstrate its versatility and\nperformance with three case studies drawn from the machine learning literature:\nTree-of-Thoughts (LLMs embedded in classic search procedures), nested tool use,\nand constrained decoding. 
Our experiments show that opportunistic evaluation\noffers a $1.5\\times$ to $4.8\\times$ speedup over sequential evaluation, while\nstill allowing practitioners to write straightforward and composable programs,\nwithout any manual parallelism or batching.\n","authors":["Stephen Mell","Steve Zdancewic","Osbert Bastani"],"pdf_url":"https://arxiv.org/pdf/2405.11361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11267v1","updated":"2024-05-18T11:34:05Z","published":"2024-05-18T11:34:05Z","title":"Concurrent Games over Relational Structures: The Origin of Game Comonads","summary":" Spoiler-Duplicator games are used in finite model theory to examine the\nexpressive power of logics. Their strategies have recently been reformulated as\ncoKleisli maps of game comonads over relational structures, providing new\nresults in finite model theory via categorical techniques. We present a novel\nframework for studying Spoiler-Duplicator games by viewing them as event\nstructures. We introduce a first systematic method for constructing comonads\nfor all one-sided Spoiler-Duplicator games: game comonads are now realised by\nadjunctions to a category of games, generically constructed from a comonad in a\nbicategory of game schema (called signature games). Maps of the constructed\ncategories of games are strategies and generalise coKleisli maps of game\ncomonads; in the case of one-sided games they are shown to coincide with\nsuitably generalised homomorphisms. 
Finally, we provide characterisations of\nstrategies on two-sided Spoiler-Duplicator games; in a common special case they\ncoincide with spans of event structures.\n","authors":["Yoàv Montacute","Glynn Winskel"],"pdf_url":"https://arxiv.org/pdf/2405.11267v1.pdf","comment":"Extended version of the paper in Logic in Computer Science (LICS)\n 2024 Proceedings"},{"id":"http://arxiv.org/abs/2405.11244v1","updated":"2024-05-18T10:05:31Z","published":"2024-05-18T10:05:31Z","title":"Strided Difference Bound Matrices","summary":" A wide range of symbolic analysis and optimization problems can be formalized\nusing polyhedra. Sub-classes of polyhedra, also known as sub-polyhedral\ndomains, are sought for their lower space and time complexity. We introduce the\nStrided Difference Bound Matrix (SDBM) domain, which represents a sweet spot in\nthe context of optimizing compilers. Its expressiveness and efficient\nalgorithms are particularly well suited to the construction of machine learning\ncompilers. We present decision algorithms, abstract domain operators and\ncomputational complexity proofs for SDBM. We also conduct an empirical study\nwith the MLIR compiler framework to validate the domain's practical\napplicability. We characterize a sub-class of SDBMs that frequently occurs in\npractice, and demonstrate even faster algorithms on this sub-class.\n","authors":["Arjun Pitchanathan","Albert Cohen","Oleksandr Zinenko","Tobias Grosser"],"pdf_url":"https://arxiv.org/pdf/2405.11244v1.pdf","comment":"Preprint and extended from the CAV 2024 conference version"},{"id":"http://arxiv.org/abs/2405.11128v1","updated":"2024-05-18T00:07:26Z","published":"2024-05-18T00:07:26Z","title":"Parsimonious Optimal Dynamic Partial Order Reduction","summary":" Stateless model checking is a fully automatic verification technique for\nconcurrent programs that checks for safety violations by exploring all possible\nthread schedulings. 
It becomes effective when coupled with Dynamic Partial\nOrder Reduction (DPOR), which introduces an equivalence on schedulings and\nreduces the amount of needed exploration. DPOR algorithms that are\n\\emph{optimal} are particularly effective in that they guarantee to explore\n\\emph{exactly} one execution from each equivalence class. Unfortunately,\nexisting sequence-based optimal algorithms may in the worst case consume memory\nthat is exponential in the size of the analyzed program. In this paper, we\npresent Parsimonious-OPtimal (POP) DPOR, an optimal DPOR algorithm for\nanalyzing multi-threaded programs under sequential consistency, whose space\nconsumption is polynomial in the worst case. POP combines several novel\nalgorithmic techniques, including (i) a parsimonious race reversal strategy,\nwhich avoids multiple reversals of the same race, (ii) an eager race reversal\nstrategy to avoid storing initial fragments of to-be-explored executions, and\n(iii) a space-efficient scheme for preventing redundant exploration, which\nreplaces the use of sleep sets. Our implementation in Nidhugg shows that these\ntechniques can significantly speed up the analysis of concurrent programs, and\ndo so with low memory consumption. 
Comparison to a related optimal DPOR\nalgorithm for a different representation of concurrent executions as graphs\nshows that POP has comparable worst-case performance for smaller benchmarks and\noutperforms the other one for larger programs.\n","authors":["Parosh Aziz Abdulla","Mohamed Faouzi Atig","Sarbojit Das","Bengt Jonsson","Konstantinos Sagonas"],"pdf_url":"https://arxiv.org/pdf/2405.11128v1.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2405.11308v1","updated":"2024-05-18T14:52:22Z","published":"2024-05-18T14:52:22Z","title":"Propositional dynamic logic and asynchronous cascade decompositions for\n regular trace languages","summary":" We propose a local, past-oriented fragment of propositional dynamic logic to\nreason about concurrent scenarios modelled as Mazurkiewicz traces, and prove it\nto be expressively complete with respect to regular trace languages. Because of\nlocality, specifications in this logic are efficiently translated into\nasynchronous automata, in a way that reflects the structure of formulas. In\nparticular, we obtain a new proof of Zielonka's fundamental theorem and we\nprove that any regular trace language can be implemented by a cascade product\nof localized asynchronous automata, which essentially operate on a single\nprocess.\n These results refine earlier results by Adsul et al. which involved a larger\nfragment of past propositional dynamic logic and used Mukund and Sohoni's\ngossip automaton. Our new results avoid using this automaton, or Zielonka's\ntimestamping mechanism and, in particular, they show how to implement a gossip\nautomaton as a cascade product.\n","authors":["Bharat Adsul","Paul Gastin","Shantanu Kulkarni","Pascal Weil"],"pdf_url":"https://arxiv.org/pdf/2405.11308v1.pdf","comment":"13 pages. 
Accepted for publication at LICS 2024"}]},"2024-05-19T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2404.19634v3","updated":"2024-05-19T13:36:57Z","published":"2024-04-30T15:35:25Z","title":"DF Louvain: Fast Incrementally Expanding Approach for Community\n Detection on Dynamic Graphs","summary":" Community detection is the problem of recognizing natural divisions in\nnetworks. A relevant challenge in this problem is to find communities on\nrapidly evolving graphs. In this report we present our Parallel Dynamic\nFrontier (DF) Louvain algorithm, which given a batch update of edge deletions\nand insertions, incrementally identifies and processes an approximate set of\naffected vertices in the graph with minimal overhead, while using a novel\napproach of incrementally updating weighted-degrees of vertices and total edge\nweights of communities. We also present our parallel implementations of\nNaive-dynamic (ND) and Delta-screening (DS) Louvain. On a server with a 64-core\nAMD EPYC-7742 processor, our experiments show that DF Louvain obtains speedups\nof 179x, 7.2x, and 5.3x on real-world dynamic graphs, compared to Static, ND,\nand DS Louvain, respectively, and is 183x, 13.8x, and 8.7x faster,\nrespectively, on large graphs with random batch updates. Moreover, DF Louvain\nimproves its performance by 1.6x for every doubling of threads.\n","authors":["Subhajit Sahu"],"pdf_url":"https://arxiv.org/pdf/2404.19634v3.pdf","comment":"22 pages, 15 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.11667v1","updated":"2024-05-19T20:20:03Z","published":"2024-05-19T20:20:03Z","title":"The Limits and Potentials of Local SGD for Distributed Heterogeneous\n Learning with Intermittent Communication","summary":" Local SGD is a popular optimization method in distributed learning, often\noutperforming other algorithms in practice, including mini-batch SGD. 
Despite\nthis success, theoretically proving the dominance of local SGD in settings with\nreasonable data heterogeneity has been difficult, creating a significant gap\nbetween theory and practice. In this paper, we provide new lower bounds for\nlocal SGD under existing first-order data heterogeneity assumptions, showing\nthat these assumptions are insufficient to prove the effectiveness of local\nupdate steps. Furthermore, under these same assumptions, we demonstrate the\nmin-max optimality of accelerated mini-batch SGD, which fully resolves our\nunderstanding of distributed optimization for several problem classes. Our\nresults emphasize the need for better models of data heterogeneity to\nunderstand the effectiveness of local SGD in practice. Towards this end, we\nconsider higher-order smoothness and heterogeneity assumptions, providing new\nupper bounds that imply the dominance of local SGD over mini-batch SGD when\ndata heterogeneity is low.\n","authors":["Kumar Kshitij Patel","Margalit Glasgow","Ali Zindari","Lingxiao Wang","Sebastian U. Stich","Ziheng Cheng","Nirmit Joshi","Nathan Srebro"],"pdf_url":"https://arxiv.org/pdf/2405.11667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11658v1","updated":"2024-05-19T20:10:55Z","published":"2024-05-19T20:10:55Z","title":"A Starting Point for Dynamic Community Detection with Leiden Algorithm","summary":" Many real-world graphs evolve with time. Identifying communities or clusters\non such graphs is an important problem. In this technical report, we extend\nthree dynamic approaches, namely, Naive-dynamic (ND), Delta-screening (DS), and\nDynamic Frontier (DF), to our multicore implementation of the Leiden algorithm,\nan algorithm known for its high-quality community detection. 
Our experiments on\na server with a 64-core AMD EPYC-7742 processor demonstrate that ND, DS, and DF\nLeiden achieve speedups of 1.25x, 1.24x, and 1.37x on large graphs with random\nbatch updates, compared to Static, ND, and DS Leiden, respectively. However, on\nreal-world dynamic graphs, ND Leiden performs the best, being on average 1.14x\nfaster than Static Leiden. We hope our early results serve as a starting point\nfor dynamic approaches to the Leiden algorithm on evolving graphs.\n","authors":["Subhajit Sahu"],"pdf_url":"https://arxiv.org/pdf/2405.11658v1.pdf","comment":"13 pages, 5 figures, 2 tables. arXiv admin note: substantial text\n overlap with arXiv:2404.19634"},{"id":"http://arxiv.org/abs/2405.11608v1","updated":"2024-05-19T16:36:16Z","published":"2024-05-19T16:36:16Z","title":"Full private delegated quantum computing tailored from user to industry","summary":" In this paper, we present a set of private and secure delegated quantum\ncomputing protocols and techniques tailored to user-level and industry-level\nuse cases, depending on the computational resources available to the client,\nthe specific privacy needs required, and the type of algorithm. Our protocols\nare presented at a high level as they are independent of the particular\nalgorithm used for such encryption and decryption processes. Additionally, we\npropose a method to verify the correct execution of operations by the external\nserver.\n","authors":["Alejandro Mata Ali","Adriano Mauricio Lusso","Edgar Mencia"],"pdf_url":"https://arxiv.org/pdf/2405.11608v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.11580v1","updated":"2024-05-19T15:15:18Z","published":"2024-05-19T15:15:18Z","title":"Securing Health Data on the Blockchain: A Differential Privacy and\n Federated Learning Framework","summary":" This study proposes a framework to enhance privacy in Blockchain-based\nInternet of Things (BIoT) systems used in the healthcare sector. 
The framework\naddresses the challenge of leveraging health data for analytics while\nprotecting patient privacy. To achieve this, the study integrates Differential\nPrivacy (DP) with Federated Learning (FL) to protect sensitive health data\ncollected by IoT nodes. The proposed framework utilizes dynamic personalization\nand adaptive noise distribution strategies to balance privacy and data utility.\nAdditionally, blockchain technology ensures secure and transparent aggregation\nand storage of model updates. Experimental results on the SVHN dataset\ndemonstrate that the proposed framework achieves strong privacy guarantees\nagainst various attack scenarios while maintaining high accuracy in health\nanalytics tasks. For 15 rounds of federated learning with an epsilon value of\n8.0, the model obtains an accuracy of 64.50%. The blockchain integration,\nutilizing Ethereum, Ganache, Web3.py, and IPFS, exhibits an average transaction\nlatency of around 6 seconds and consistent gas consumption across rounds,\nvalidating the practicality and feasibility of the proposed approach.\n","authors":["Daniel Commey","Sena Hounsinou","Garth V. Crosby"],"pdf_url":"https://arxiv.org/pdf/2405.11580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07297v3","updated":"2024-05-19T13:52:48Z","published":"2023-07-14T12:20:32Z","title":"Game Dynamics and Equilibrium Computation in the Population Protocol\n Model","summary":" We initiate the study of game dynamics in the population protocol model: $n$\nagents each maintain a current local strategy and interact in pairs uniformly\nat random. Upon each interaction, the agents play a two-person game and receive\na payoff from an underlying utility function, and they can subsequently update\ntheir strategies according to a fixed local algorithm. 
In this setting, we ask\nhow the distribution over agent strategies evolves over a sequence of\ninteractions, and we introduce a new distributional equilibrium concept to\nquantify the quality of such distributions. As an initial example, we study a\nclass of repeated prisoner's dilemma games, and we consider a family of simple\nlocal update algorithms that yield non-trivial dynamics over the distribution\nof agent strategies. We show that these dynamics are related to a new class of\nhigh-dimensional Ehrenfest random walks, and we derive exact characterizations\nof their stationary distributions, bounds on their mixing times, and prove\ntheir convergence to approximate distributional equilibria. Our results\nhighlight trade-offs between the local state space of each agent, and the\nconvergence rate and approximation factor of the underlying dynamics. Our\napproach opens the door towards the further characterization of equilibrium\ncomputation for other classes of games and dynamics in the population setting.\n","authors":["Dan Alistarh","Krishnendu Chatterjee","Mehrdad Karrabi","John Lazarsfeld"],"pdf_url":"https://arxiv.org/pdf/2307.07297v3.pdf","comment":"To appear in PODC 2024"},{"id":"http://arxiv.org/abs/2311.02650v2","updated":"2024-05-19T10:16:07Z","published":"2023-11-05T13:37:15Z","title":"Ephemeral Rollups are All you Need","summary":" In the realm of open and composable gaming, we envision platforms where users\nactively expand, create, engage, and immerse themselves in a rich world of\nentertainment. One promising avenue for achieving this vision is through fully\non-chain (FOC) games, where both game state and logic reside on the blockchain,\nmaximizing composability. However, we must grapple with inherent limitations\nand trade-offs, particularly in terms of costs and scalability. 
This paper\nproposes BOLT, a framework that leverages the Solana Virtual Machine (SVM) to\nscale FOC games without state fragmentation or compromised trust assumptions.\nThe framework introduces a systematic approach for discovering, utilizing, and\npublishing modular pieces of logic as components deeply rooted in the\nEntity-Component-System (ECS) pattern. To enhance scalability and resource\noptimization, we introduce the concept of Ephemeral Rollups (ERs) that overcome\nthe tradeoffs of L2s horizontal scaling. These dedicated runtimes can be\ncustomized to provide higher operational speed, configurable ticking\nmechanisms, provable sessions and gasless transactions without\ncomposability-scalability tradeoffs.\n","authors":["Gabriele Picco","Andrea Fortugno"],"pdf_url":"https://arxiv.org/pdf/2311.02650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11440v1","updated":"2024-05-19T04:23:40Z","published":"2024-05-19T04:23:40Z","title":"A GAN-Based Data Poisoning Attack Against Federated Learning Systems and\n Its Countermeasure","summary":" As a distributed machine learning paradigm, federated learning (FL) is\ncollaboratively carried out on privately owned datasets but without direct data\naccess. Although the original intention is to allay data privacy concerns,\n\"available but not visible\" data in FL potentially brings new security threats,\nparticularly poisoning attacks that target such \"not visible\" local data.\nInitial attempts have been made to conduct data poisoning attacks against FL\nsystems, but cannot be fully successful due to their high chance of causing\nstatistical anomalies. To unleash the potential for truly \"invisible\" attacks\nand build a more deterrent threat model, in this paper, a new data poisoning\nattack model named VagueGAN is proposed, which can generate seemingly\nlegitimate but noisy poisoned data by untraditionally taking advantage of\ngenerative adversarial network (GAN) variants. 
Capable of manipulating the\nquality of poisoned data on demand, VagueGAN enables to trade-off attack\neffectiveness and stealthiness. Furthermore, a cost-effective countermeasure\nnamed Model Consistency-Based Defense (MCD) is proposed to identify\nGAN-poisoned data or models after finding out the consistency of GAN outputs.\nExtensive experiments on multiple datasets indicate that our attack method is\ngenerally much more stealthy as well as more effective in degrading FL\nperformance with low complexity. Our defense method is also shown to be more\ncompetent in identifying GAN-poisoned data or models. The source codes are\npublicly available at\n\\href{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}.\n","authors":["Wei Sun","Bo Gao","Ke Xiong","Yuwei Wang","Pingyi Fan","Khaled Ben Letaief"],"pdf_url":"https://arxiv.org/pdf/2405.11440v1.pdf","comment":"18 pages, 16 figures"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2405.06505v3","updated":"2024-05-19T19:02:48Z","published":"2024-05-10T14:37:24Z","title":"Hal: A Language-General Framework for Analysis of User-Specified\n Monotone Frameworks [DRAFT]","summary":" Writing dataflow analyzers requires both language and domain-specificity.\nThat is to say, each programming language and each program property requires\nits own analyzer. To enable a streamlined, user-driven approach to dataflow\nanalyzers, we introduce the theoretical framework for a user-specified dataflow\nanalysis. This framework is constructed in such a way that the user has to\nspecify as little as possible, while the analyzer infers and computes\neverything else, including interprocedural embellishments. 
This theoretical\nframework was also implemented in Java, where users can specify a program\nproperty alongside minimal extra information to induce a dataflow analysis.\nThis framework (both theoretical and in implementation) is language-general,\nmeaning that it is independent of syntax and semantics (as all necessary\nsyntactic and semantic information is provided by the user, and this\ninformation is provided only once for a given language). In this paper, we\nintroduce basic notions of intraprocedural and interprocedural dataflow\nanalyses, the proposed \"Implicit Monotone Framework,\" and a rigorous framework\nfor partial functions as a property space.\n","authors":["Abdullah Rasheed"],"pdf_url":"https://arxiv.org/pdf/2405.06505v3.pdf","comment":"Undergraduate Senior Capstone Project"},{"id":"http://arxiv.org/abs/2405.11535v1","updated":"2024-05-19T12:42:39Z","published":"2024-05-19T12:42:39Z","title":"Proving Functional Program Equivalence via Directed Lemma Synthesis","summary":" Proving equivalence between functional programs is a fundamental problem in\nprogram verification, which often amounts to reasoning about algebraic data\ntypes (ADTs) and compositions of structural recursions. Modern theorem provers\naddress this problem by applying structural induction, which is insufficient\nfor proving many equivalence theorems. In such cases, one has to invent a set\nof lemmas, prove these lemmas by additional induction, and use these lemmas to\nprove the original theorem. There is, however, a lack of systematic\nunderstanding of what lemmas are needed for inductive proofs and how these\nlemmas can be synthesized automatically. This paper presents directed lemma\nsynthesis, an effective approach to automating equivalence proofs by\ndiscovering critical lemmas using program synthesis techniques. We first\nidentify two induction-friendly forms of propositions that give formal\nguarantees to the progress of the proof. 
We then propose two tactics that\nsynthesize and apply lemmas, thereby transforming the proof goal into\ninduction-friendly forms. Both tactics reduce lemma synthesis to a specialized\nclass of program synthesis problems with efficient algorithms. Experimental\nresults demonstrate the effectiveness of our approach: Compared to\nstate-of-the-art equivalence checkers employing heuristic-based lemma\nenumeration, directed lemma synthesis saves 95.47% runtime on average and\nsolves 38 more tasks over an extended version of the standard benchmark set.\n","authors":["Yican Sun","Ruyi Ji","Jian Fang","Xuanlin Jiang","Mingshuai Chen","Yingfei Xiong"],"pdf_url":"https://arxiv.org/pdf/2405.11535v1.pdf","comment":"21 pages"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2402.18090v3","updated":"2024-05-19T06:01:38Z","published":"2024-02-28T06:22:24Z","title":"Computing Minimal Absent Words and Extended Bispecial Factors with CDAWG\n Space","summary":" A string $w$ is said to be a minimal absent word (MAW) for a string $S$ if\n$w$ does not occur in $S$ and any proper substring of $w$ occurs in $S$. We\nfocus on non-trivial MAWs which are of length at least 2. Finding such\nnon-trivial MAWs for a given string is motivated for applications in\nbioinformatics and data compression. Fujishige et al. [TCS 2023] proposed a\ndata structure of size $\\Theta(n)$ that can output the set $\\mathsf{MAW}(S)$ of\nall MAWs for a given string $S$ of length $n$ in $O(n + |\\mathsf{MAW}(S)|)$\ntime, based on the directed acyclic word graph (DAWG). In this paper, we\npresent a more space efficient data structure based on the compact DAWG\n(CDAWG), which can output $\\mathsf{MAW}(S)$ in $O(|\\mathsf{MAW}(S)|)$ time with\n$O(\\mathsf{e}_\\min)$ space, where $\\mathsf{e}_\\min$ denotes the minimum of the\nsizes of the CDAWGs for $S$ and for its reversal $S^R$. 
For any strings of\nlength $n$, it holds that $\\mathsf{e}_\\min < 2n$, and for highly repetitive\nstrings $\\mathsf{e}_\\min$ can be sublinear (up to logarithmic) in $n$. We also\nshow that MAWs and their generalization minimal rare words have close\nrelationships with extended bispecial factors, via the CDAWG.\n","authors":["Shunsuke Inenaga","Takuya Mieno","Hiroki Arimura","Mitsuru Funakoshi","Yuta Fujishige"],"pdf_url":"https://arxiv.org/pdf/2402.18090v3.pdf","comment":"Accepted for IWOCA 2024"},{"id":"http://arxiv.org/abs/2405.11657v1","updated":"2024-05-19T20:06:38Z","published":"2024-05-19T20:06:38Z","title":"On the Expressivity of Recurrent Neural Cascades with Identity","summary":" Recurrent Neural Cascades (RNC) are the class of recurrent neural networks\nwith no cyclic dependencies among recurrent neurons. Their subclass RNC+ with\npositive recurrent weights has been shown to be closely connected to the\nstar-free regular languages, which are the expressivity of many\nwell-established temporal logics. The existing expressivity results show that\nthe regular languages captured by RNC+ are the star-free ones, and they leave\nopen the possibility that RNC+ may capture languages beyond regular. We exclude\nthis possibility for languages that include an identity element, i.e., an input\nthat can occur an arbitrary number of times without affecting the output.\nNamely, in the presence of an identity element, we show that the languages\ncaptured by RNC+ are exactly the star-free regular languages. Identity elements\nare ubiquitous in temporal patterns, and hence our results apply to a large\nnumber of applications. The implications of our results go beyond expressivity.\nAt their core, we establish a close structural correspondence between RNC+ and\nsemiautomata cascades, showing that every neuron can be equivalently captured\nby a three-state semiautomaton. 
A notable consequence of this result is that\nRNC+ are no more succinct than cascades of three-state semiautomata.\n","authors":["Nadezda A. Knorozova","Alessandro Ronca"],"pdf_url":"https://arxiv.org/pdf/2405.11657v1.pdf","comment":null}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2405.11607v1","updated":"2024-05-19T16:27:21Z","published":"2024-05-19T16:27:21Z","title":"OFHE: An Electro-Optical Accelerator for Discretized TFHE","summary":" This paper presents \\textit{OFHE}, an electro-optical accelerator designed to\nprocess Discretized TFHE (DTFHE) operations, which encrypt multi-bit messages\nand support homomorphic multiplications, lookup table operations and\nfull-domain functional bootstrappings. While DTFHE is more efficient and\nversatile than other fully homomorphic encryption schemes, it requires 32-,\n64-, and 128-bit polynomial multiplications, which can be time-consuming.\nExisting TFHE accelerators are not easily upgradable to support DTFHE\noperations due to limited datapaths, a lack of datapath bit-width\nreconfigurability, and power inefficiencies when processing FFT and inverse FFT\n(IFFT) kernels. Compared to prior TFHE accelerators, OFHE addresses these\nchallenges by improving the DTFHE operation latency by 8.7\\%, the DTFHE\noperation throughput by $57\\%$, and the DTFHE operation throughput per Watt by\n$94\\%$.\n","authors":["Mengxin Zheng","Cheng Chu","Qian Lou","Nathan Youngblood","Mo Li","Sajjad Moazeni","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.11607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11425v1","updated":"2024-05-19T02:11:30Z","published":"2024-05-19T02:11:30Z","title":"Enabling full-speed random access to the entire memory on the A100 GPU","summary":" We describe some features of the A100 memory architecture. In particular, we\ngive a technique to reverse-engineer some hardware layout information. 
Using\nthis information, we show how to avoid TLB issues to obtain full-speed random\nHBM access to the entire memory, as long as we constrain any particular thread\nto a reduced access window of less than 64GB.\n","authors":["Alden Walker"],"pdf_url":"https://arxiv.org/pdf/2405.11425v1.pdf","comment":"6 pages, 6 figures"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2405.11425v1","updated":"2024-05-19T02:11:30Z","published":"2024-05-19T02:11:30Z","title":"Enabling full-speed random access to the entire memory on the A100 GPU","summary":" We describe some features of the A100 memory architecture. In particular, we\ngive a technique to reverse-engineer some hardware layout information. Using\nthis information, we show how to avoid TLB issues to obtain full-speed random\nHBM access to the entire memory, as long as we constrain any particular thread\nto a reduced access window of less than 64GB.\n","authors":["Alden Walker"],"pdf_url":"https://arxiv.org/pdf/2405.11425v1.pdf","comment":"6 pages, 6 figures"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2405.11699v1","updated":"2024-05-19T23:31:04Z","published":"2024-05-19T23:31:04Z","title":"Fixed-parameter tractability of canonical polyadic decomposition over\n finite fields","summary":" We present a simple proof that finding a rank-$R$ canonical polyadic\ndecomposition of 3-dimensional tensors over a finite field $\\mathbb{F}$ is\nfixed-parameter tractable with respect to $R$ and $\\mathbb{F}$. 
We also show\nsome more concrete upper bounds on the time complexity of this problem.\n","authors":["Jason Yang"],"pdf_url":"https://arxiv.org/pdf/2405.11699v1.pdf","comment":"8 pages; some proofs copied from arXiv:2401.06857"},{"id":"http://arxiv.org/abs/2306.10600v3","updated":"2024-05-19T17:30:09Z","published":"2023-06-18T17:17:34Z","title":"A Smoothed FPTAS for Equilibria in Congestion Games","summary":" We present a fully polynomial-time approximation scheme (FPTAS) for computing\nequilibria in congestion games, under smoothed running-time analysis. More\nprecisely, we prove that if the resource costs of a congestion game are\nrandomly perturbed by independent noises, whose density is at most $\\phi$, then\nany sequence of $(1+\\varepsilon)$-improving dynamics will reach an\n$(1+\\varepsilon)$-approximate pure Nash equilibrium (PNE) after an expected\nnumber of steps which is strongly polynomial in $\\frac{1}{\\varepsilon}$,\n$\\phi$, and the size of the game's description. Our results establish a sharp\ncontrast to the traditional worst-case analysis setting, where it is known that\nbetter-response dynamics take exponentially long to converge to\n$\\alpha$-approximate PNE, for any constant factor $\\alpha\\geq 1$. As a matter\nof fact, computing $\\alpha$-approximate PNE in congestion games is PLS-hard.\n We demonstrate how our analysis can be applied to various different models of\ncongestion games including general, step-function, and polynomial cost, as well\nas fair cost-sharing games (where the resource costs are decreasing). It is\nimportant to note that our bounds do not depend explicitly on the cardinality\nof the players' strategy sets, and thus the smoothed FPTAS is readily\napplicable to network congestion games as well.\n","authors":["Yiannis Giannakopoulos"],"pdf_url":"https://arxiv.org/pdf/2306.10600v3.pdf","comment":"To appear at EC'24. Simplified analysis and improved bound in Lemma\n 1. Improved bound at Eq. (11). 
These result in improved smoothed running time\n bounds for all our congestion game models (i.e. Sections 3.2, 3.3.1, 3.3.2,\n and 3.3.3)"},{"id":"http://arxiv.org/abs/2303.14405v5","updated":"2024-05-19T14:26:35Z","published":"2023-03-25T08:58:11Z","title":"On the Efficiency of An Election Game of Two or More Parties: How Bad\n Can It Be?","summary":" An election campaign among two or more parties can be viewed as a game of two\nor more players, each of which has its own candidates as the pure strategies.\nPeople, as voters, comprise supporters for each party, and a candidate brings\nutility for the supporters of each party. Each party nominates exactly one of\nits candidates to compete against the other party's. A candidate is assumed to\nwin the election with greater or equal odds if it brings more utility for all\nthe people. The payoff of each player is the expected utility that its\nsupporters get. The game is egoistic if every candidate benefits its party's\nsupporters more than any candidate from a competing party does. In this paper,\nwe first prove that it is NP-complete to determine whether an election game in\na succinct representation, which is called the general form, has a\npure-strategy Nash equilibrium even if it is egoistic. Next, we propose a\nfixed-parameter tractable algorithm to compute a pure-strategy Nash equilibrium\nof an egoistic election game and show that a naive constant time algorithm\nleads to a (1+e)-approximate pure-strategy Nash equilibrium when the winning\nprobability is computed by a softmax function. Finally, perhaps surprisingly,\nwe show that the price of anarchy for egoistic election games is upper bounded\nby the number of parties. 
Our results suggest that an election becomes\nunpredictable in terms of stability and efficiency when more than two parties\nare involved, and, to some extent, also provides supporting arguments for why\nthe two-party system is prevalent in democratic countries.\n","authors":["Chuang-Chieh Lin","Chi-Jen Lu","Po-An Chen"],"pdf_url":"https://arxiv.org/pdf/2303.14405v5.pdf","comment":"A previous version appeared at the 6th Games, Agents, and Incentives\n Workshop (GAIW-24). The current version has been submitted to SAGT 2024"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2405.11657v1","updated":"2024-05-19T20:06:38Z","published":"2024-05-19T20:06:38Z","title":"On the Expressivity of Recurrent Neural Cascades with Identity","summary":" Recurrent Neural Cascades (RNC) are the class of recurrent neural networks\nwith no cyclic dependencies among recurrent neurons. Their subclass RNC+ with\npositive recurrent weights has been shown to be closely connected to the\nstar-free regular languages, which are the expressivity of many\nwell-established temporal logics. The existing expressivity results show that\nthe regular languages captured by RNC+ are the star-free ones, and they leave\nopen the possibility that RNC+ may capture languages beyond regular. We exclude\nthis possibility for languages that include an identity element, i.e., an input\nthat can occur an arbitrary number of times without affecting the output.\nNamely, in the presence of an identity element, we show that the languages\ncaptured by RNC+ are exactly the star-free regular languages. Identity elements\nare ubiquitous in temporal patterns, and hence our results apply to a large\nnumber of applications. The implications of our results go beyond expressivity.\nAt their core, we establish a close structural correspondence between RNC+ and\nsemiautomata cascades, showing that every neuron can be equivalently captured\nby a three-state semiautomaton. 
A notable consequence of this result is that\nRNC+ are no more succinct than cascades of three-state semiautomata.\n","authors":["Nadezda A. Knorozova","Alessandro Ronca"],"pdf_url":"https://arxiv.org/pdf/2405.11657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06817v2","updated":"2024-05-19T11:43:18Z","published":"2024-03-11T15:34:57Z","title":"Are Targeted Messages More Effective?","summary":" Graph neural networks (GNN) are deep learning architectures for graphs.\nEssentially, a GNN is a distributed message passing algorithm, which is\ncontrolled by parameters learned from data. It operates on the vertices of a\ngraph: in each iteration, vertices receive a message on each incoming edge,\naggregate these messages, and then update their state based on their current\nstate and the aggregated messages. The expressivity of GNNs can be\ncharacterised in terms of certain fragments of first-order logic with counting\nand the Weisfeiler-Lehman algorithm.\n The core GNN architecture comes in two different versions. In the first\nversion, a message only depends on the state of the source vertex, whereas in\nthe second version it depends on the states of the source and target vertices.\nIn practice, both of these versions are used, but the theory of GNNs so far\nmostly focused on the first one. On the logical side, the two versions\ncorrespond to two fragments of first-order logic with counting that we call\nmodal and guarded.\n The question whether the two versions differ in their expressivity has been\nmostly overlooked in the GNN literature and has only been asked recently\n(Grohe, LICS'23). We answer this question here. It turns out that the answer is\nnot as straightforward as one might expect. By proving that the modal and\nguarded fragment of first-order logic with counting have the same expressivity\nover labelled undirected graphs, we show that in a non-uniform setting the two\nGNN versions have the same expressivity. 
However, we also prove that in a\nuniform setting the second version is strictly more expressive.\n","authors":["Martin Grohe","Eran Rosenbluth"],"pdf_url":"https://arxiv.org/pdf/2403.06817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11418v1","updated":"2024-05-19T00:51:51Z","published":"2024-05-19T00:51:51Z","title":"Completeness of two fragments of a logic for conditional strategic\n reasoning","summary":" Classical logics for strategic reasoning, such as Coalition Logic and\nAlternating-time Temporal Logic, formalize absolute strategic reasoning about\nthe unconditional strategic abilities of agents to achieve their goals. Goranko\nand Ju introduced a logic ConStR for strategic reasoning about conditional\nstrategic abilities. However, its completeness is still an open problem. ConStR\nhas three featured operators, and one of them has the following reading: For\nsome action of A that guarantees the achievement of her goal, B has an action\nto guarantee the achievement of his goal. The logic about this operator is\ncalled CConStR. In this paper, we prove completeness for two fragments of\nCConStR. The key notions of our proof approach include downward validity lemma,\ngrafted models, and upward derivability lemma. 
The proof approach has good\npotential to be applied to the completeness of ConStR and other logics.\n","authors":["Yinfeng Li","Fengkui Ju"],"pdf_url":"https://arxiv.org/pdf/2405.11418v1.pdf","comment":null}]},"2024-05-20T00:00:00Z":{"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2405.07975v2","updated":"2024-05-20T16:40:35Z","published":"2024-05-13T17:48:45Z","title":"Dynamic Programming for Symbolic Boolean Realizability and Synthesis","summary":" Inspired by recent progress in dynamic programming approaches for weighted\nmodel counting, we investigate a dynamic-programming approach in the context of\nboolean realizability and synthesis, which takes a conjunctive-normal-form\nboolean formula over input and output variables, and aims at synthesizing\nwitness functions for the output variables in terms of the inputs. We show how\ngraded project-join trees, obtained via tree decomposition, can be used to\ncompute a BDD representing the realizability set for the input formulas in a\nbottom-up order. We then show how the intermediate BDDs generated during\nrealizability checking phase can be applied to synthesizing the witness\nfunctions in a top-down manner. An experimental evaluation of a solver --\nDPSynth -- based on these ideas demonstrates that our approach for Boolean\nrealizabilty and synthesis has superior time and space performance over a\nheuristics-based approach using same symbolic representations. We discuss the\nadvantage on scalability of the new approach, and also investigate our findings\non the performance of the DP framework.\n","authors":["Yi Lin","Lucas M. Tabajara","Moshe Y. 
Vardi"],"pdf_url":"https://arxiv.org/pdf/2405.07975v2.pdf","comment":"32 pages including appendices and bibliography, 5 figures, paper is\n to be published in CAV 2024, but this version is inclusive of the Appendix"},{"id":"http://arxiv.org/abs/2404.07823v2","updated":"2024-05-20T08:39:51Z","published":"2024-04-11T15:08:11Z","title":"Learning Deterministic Multi-Clock Timed Automata","summary":" We present an algorithm for active learning of deterministic timed automata\nwith multiple clocks. The algorithm is within the querying framework of\nAngluin's $L^*$ algorithm and follows the idea proposed in existing work on the\nactive learning of deterministic one-clock timed automata. We introduce an\nequivalence relation over the reset-clocked language of a timed automaton and\nthen transform the learning problem into learning the corresponding\nreset-clocked language of the target automaton. Since a reset-clocked language\nincludes the clock reset information which is not observable, we first present\nthe approach of learning from a powerful teacher who can provide reset\ninformation by answering reset information queries from the learner. Then we\nextend the algorithm in a normal teacher situation in which the learner can\nonly ask standard membership query and equivalence query while the learner\nguesses the reset information. We prove that the learning algorithm terminates\nand returns a correct deterministic timed automaton. Due to the need of\nguessing whether the clocks reset at the transitions, the algorithm is of\nexponential complexity in the size of the target automaton.\n","authors":["Yu Teng","Miaomiao Zhang","Jie An"],"pdf_url":"https://arxiv.org/pdf/2404.07823v2.pdf","comment":"20 pages. 
It is an author version of the paper with the same title\n accepted by HSCC 2024"},{"id":"http://arxiv.org/abs/2405.11849v1","updated":"2024-05-20T07:50:51Z","published":"2024-05-20T07:50:51Z","title":"Jumping Automata Must Pay","summary":" Jumping automata are finite automata that read their input in a\nnon-sequential manner, by allowing a reading head to ``jump'' between positions\non the input, consuming a permutation of the input word. We argue that allowing\nthe head to jump should incur some cost. To this end, we propose three\nquantitative semantics for jumping automata, whereby the jumps of the head in\nan accepting run define the cost of the run. The three semantics correspond to\ndifferent interpretations of jumps: the \\emph{absolute distance} semantics\ncounts the distance the head jumps, the \\emph{reversal} semantics counts the\nnumber of times the head changes direction, and the \\emph{Hamming distance}\nmeasures the number of letter-swaps the run makes.\n We study these measures, with the main focus being the \\emph{boundedness\nproblem}: given a jumping automaton, decide whether its (quantitative)\nlanguages is bounded by some given number $k$. We establish the decidability\nand complexity for this problem under several variants.\n","authors":["Shaull Almagor","Ishai Salgado"],"pdf_url":"https://arxiv.org/pdf/2405.11849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12350v1","updated":"2024-05-20T19:59:25Z","published":"2024-05-20T19:59:25Z","title":"A framework for extraction and transformation of documents","summary":" We present a theoretical framework for the extraction and transformation of\ntext documents. We propose to use a two-phase process where the first phase\nextracts span-tuples from a document, and the second phase maps the content of\nthe span-tuples into new documents. 
We base the extraction phase on the\nframework of document spanners and the transformation phase on the theory of\npolyregular functions, the class of regular string-to-string functions with\npolynomial growth.\n For supporting practical extract-transform scenarios, we propose an extension\nof document spanners described by regex formulas from span-tuples to so-called\nmultispan-tuples, where variables are mapped to sets of spans. We prove that\nthis extension, called regex multispanners, has the same desirable properties\nas standard spanners described by regex formulas. In our framework, an\nExtract-Transform (ET) program is given by a regex multispanner followed by a\npolyregular function.\n In this paper, we study the expressibility and evaluation problem of ET\nprograms when the transformation function is linear, called linear ET programs.\nWe show that linear ET programs are equally expressive as non-deterministic\nstreaming string transducers under bag semantics. Moreover, we show that linear\nET programs are closed under composition. Finally, we present an enumeration\nalgorithm for evaluating every linear ET program over a document with linear\ntime preprocessing and constant delay.\n","authors":["Cristian Riveros","Markus L. Schmid","Nicole Schweikardt"],"pdf_url":"https://arxiv.org/pdf/2405.12350v1.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2405.10887v2","updated":"2024-05-20T09:53:49Z","published":"2024-05-17T16:25:40Z","title":"Preservation theorems on sparse classes revisited","summary":" We revisit the work studying homomorphism preservation for first-order logic\nin sparse classes of structures initiated in [Atserias et al., JACM 2006] and\n[Dawar, JCSS 2010]. These established that first-order logic has the\nhomomorphism preservation property in any sparse class that is monotone and\naddable. It turns out that the assumption of addability is not strong enough\nfor the proofs given. 
We demonstrate this by constructing classes of graphs of\nbounded treewidth which are monotone and addable but fail to have homomorphism\npreservation. We also show that homomorphism preservation fails on the class of\nplanar graphs. On the other hand, the proofs of homomorphism preservation can\nbe recovered by replacing addability by a stronger condition of amalgamation\nover bottlenecks. This is analogous to a similar condition formulated for\nextension preservation in [Atserias et al., SiCOMP 2008].\n","authors":["Anuj Dawar","Ioannis Eleftheriadis"],"pdf_url":"https://arxiv.org/pdf/2405.10887v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2405.10308v2","updated":"2024-05-20T17:02:48Z","published":"2024-05-16T17:58:19Z","title":"Efficient Implementation of an Abstract Domain of Quantified First-Order\n Formulas","summary":" This paper lays a practical foundation for using abstract interpretation with\nan abstract domain that consists of sets of quantified first-order logic\nformulas. This abstract domain seems infeasible at first sight due to the\ncomplexity of the formulas involved and the enormous size of sets of formulas\n(abstract elements). We introduce an efficient representation of abstract\nelements, which eliminates redundancies based on a novel syntactic subsumption\nrelation that under-approximates semantic entailment. We develop algorithms and\ndata structures to efficiently compute the join of an abstract element with the\nabstraction of a concrete state, operating on the representation of abstract\nelements. To demonstrate feasibility of the domain, we use our data structures\nand algorithms to implement a symbolic abstraction algorithm that computes the\nleast fixpoint of the best abstract transformer of a transition system, which\ncorresponds to the strongest inductive invariant. 
We succeed at finding, for\nexample, the least fixpoint for Paxos (which in our representation has 1,438\nformulas with $\\forall^*\\exists^*\\forall^*$ quantification) in time comparable\nto state-of-the-art property-directed approaches.\n","authors":["Eden Frenkel","Tej Chajed","Oded Padon","Sharon Shoham"],"pdf_url":"https://arxiv.org/pdf/2405.10308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09504v2","updated":"2024-05-20T13:03:32Z","published":"2024-05-15T16:53:30Z","title":"Initial Algebras Unchained -- A Novel Initial Algebra Construction\n Formalized in Agda","summary":" The initial algebra for an endofunctor F provides a recursion and induction\nscheme for data structures whose constructors are described by F. The\ninitial-algebra construction by Ad\\'amek (1974) starts with the initial object\n(e.g. the empty set) and successively applies the functor until a fixed point\nis reached, an idea inspired by Kleene's fixed point theorem. Depending on the\nfunctor of interest, this may require transfinitely many steps indexed by\nordinal numbers until termination.\n We provide a new initial algebra construction which is not based on an\nordinal-indexed chain. Instead, our construction is loosely inspired by\nPataraia's fixed point theorem and forms the colimit of all finite recursive\ncoalgebras. This is reminiscent of the construction of the rational fixed point\nof an endofunctor that forms the colimit of all finite coalgebras. For our main\ncorrectness theorem, we assume the given endofunctor is accessible on a (weak\nform of) locally presentable category. 
Our proofs are constructive and fully\nformalized in Agda.\n","authors":["Thorsten Wißmann","Stefan Milius"],"pdf_url":"https://arxiv.org/pdf/2405.09504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07975v2","updated":"2024-05-20T16:40:35Z","published":"2024-05-13T17:48:45Z","title":"Dynamic Programming for Symbolic Boolean Realizability and Synthesis","summary":" Inspired by recent progress in dynamic programming approaches for weighted\nmodel counting, we investigate a dynamic-programming approach in the context of\nboolean realizability and synthesis, which takes a conjunctive-normal-form\nboolean formula over input and output variables, and aims at synthesizing\nwitness functions for the output variables in terms of the inputs. We show how\ngraded project-join trees, obtained via tree decomposition, can be used to\ncompute a BDD representing the realizability set for the input formulas in a\nbottom-up order. We then show how the intermediate BDDs generated during\nrealizability checking phase can be applied to synthesizing the witness\nfunctions in a top-down manner. An experimental evaluation of a solver --\nDPSynth -- based on these ideas demonstrates that our approach for Boolean\nrealizabilty and synthesis has superior time and space performance over a\nheuristics-based approach using same symbolic representations. We discuss the\nadvantage on scalability of the new approach, and also investigate our findings\non the performance of the DP framework.\n","authors":["Yi Lin","Lucas M. Tabajara","Moshe Y. 
Vardi"],"pdf_url":"https://arxiv.org/pdf/2405.07975v2.pdf","comment":"32 pages including appendices and bibliography, 5 figures, paper is\n to be published in CAV 2024, but this version is inclusive of the Appendix"},{"id":"http://arxiv.org/abs/2402.08434v2","updated":"2024-05-20T16:44:06Z","published":"2024-02-13T13:03:49Z","title":"Solving promise equations over monoids and groups","summary":" We give a complete complexity classification for the problem of finding a\nsolution to a given system of equations over a fixed finite monoid, given that\na solution over a more restricted monoid exists. As a corollary, we obtain a\ncomplexity classification for the same problem over groups.\n","authors":["Alberto Larrauri","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2402.08434v2.pdf","comment":"Full version of an ICALP 2024 paper"},{"id":"http://arxiv.org/abs/2405.12104v1","updated":"2024-05-20T15:21:12Z","published":"2024-05-20T15:21:12Z","title":"Deciding branching hyperproperties for real time systems","summary":" Security properties of real-time systems often involve reasoning about\nhyper-properties, as opposed to properties of single executions or trees of\nexecutions. These hyper-properties need to additionally be expressive enough to\nreason about real-time constraints. Examples of such properties include\ninformation flow, side channel attacks and service-level agreements. In this\npaper we study computational problems related to a branching-time,\nhyper-property extension of metric temporal logic (MTL) that we call HCMTL*. We\nconsider both the interval-based and point-based semantics of this logic. The\nverification problem that we consider is to determine if a given HCMTL* formula\n$\\varphi$ is true in a system represented by a timed automaton. We show that\nthis problem is undecidable. We then show that the verification problem is\ndecidable if we consider executions upto a fixed time horizon $T$. 
Our\ndecidability result relies on reducing the verification problem to the truth of\nan MSO formula over reals with a bounded time interval.\n","authors":["Nabarun Deka","Minjian Zhang","Rohit Chadha","Mahesh Viswanathan"],"pdf_url":"https://arxiv.org/pdf/2405.12104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03103v2","updated":"2024-05-20T10:28:52Z","published":"2024-02-05T15:31:41Z","title":"Scoped Effects as Parameterized Algebraic Theories","summary":" Notions of computation can be modelled by monads. Algebraic effects offer a\ncharacterization of monads in terms of algebraic operations and equational\naxioms, where operations are basic programming features, such as reading or\nupdating the state, and axioms specify observably equivalent expressions.\nHowever, many useful programming features depend on additional mechanisms such\nas delimited scopes or dynamically allocated resources. Such mechanisms can be\nsupported via extensions to algebraic effects including scoped effects and\nparameterized algebraic theories. We present a fresh perspective on scoped\neffects by translation into a variation of parameterized algebraic theories.\nThe translation enables a new approach to equational reasoning for scoped\neffects and gives rise to an alternative characterization of monads in terms of\ngenerators and equations involving both scoped and algebraic operations. 
We\ndemonstrate the power of our fresh perspective by way of equational\ncharacterizations of several known models of scoped effects.\n","authors":["Cristina Matache","Sam Lindley","Sean Moss","Sam Staton","Nicolas Wu","Zhixuan Yang"],"pdf_url":"https://arxiv.org/pdf/2402.03103v2.pdf","comment":"Extended version of the ESOP 2024 paper with the same title"},{"id":"http://arxiv.org/abs/2401.14461v2","updated":"2024-05-20T05:52:05Z","published":"2024-01-25T19:00:25Z","title":"Marabou 2.0: A Versatile Formal Analyzer of Neural Networks","summary":" This paper serves as a comprehensive system description of version 2.0 of the\nMarabou framework for formal analysis of neural networks. We discuss the tool's\narchitectural design and highlight the major features and components introduced\nsince its initial release.\n","authors":["Haoze Wu","Omri Isac","Aleksandar Zeljić","Teruhiro Tagomori","Matthew Daggitt","Wen Kokke","Idan Refaeli","Guy Amir","Kyle Julian","Shahaf Bassan","Pei Huang","Ori Lahav","Min Wu","Min Zhang","Ekaterina Komendantskaya","Guy Katz","Clark Barrett"],"pdf_url":"https://arxiv.org/pdf/2401.14461v2.pdf","comment":"Condensed version accepted at CAV'24"},{"id":"http://arxiv.org/abs/2405.11706v1","updated":"2024-05-20T00:28:00Z","published":"2024-05-20T00:28:00Z","title":"Increasing the LLM Accuracy for Question Answering: Ontologies to the\n Rescue!","summary":" There is increasing evidence that question-answering (QA) systems with Large\nLanguage Models (LLMs), which employ a knowledge graph/semantic representation\nof an enterprise SQL database (i.e. Text-to-SPARQL), achieve higher accuracy\ncompared to systems that answer questions directly on SQL databases (i.e.\nText-to-SQL). Our previous benchmark research showed that by using a knowledge\ngraph, the accuracy improved from 16% to 54%. The question remains: how can we\nfurther improve the accuracy and reduce the error rate? 
Building on the\nobservations of our previous research where the inaccurate LLM-generated SPARQL\nqueries followed incorrect paths, we present an approach that consists of 1)\nOntology-based Query Check (OBQC): detects errors by leveraging the ontology of\nthe knowledge graph to check if the LLM-generated SPARQL query matches the\nsemantic of ontology and 2) LLM Repair: use the error explanations with an LLM\nto repair the SPARQL query. Using the chat with the data benchmark, our primary\nfinding is that our approach increases the overall accuracy to 72% including an\nadditional 8% of \"I don't know\" unknown results. Thus, the overall error rate\nis 20%. These results provide further evidence that investing knowledge graphs,\nnamely the ontology, provides higher accuracy for LLM powered question\nanswering systems.\n","authors":["Dean Allemang","Juan Sequeda"],"pdf_url":"https://arxiv.org/pdf/2405.11706v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2405.07953v2","updated":"2024-05-20T22:46:06Z","published":"2024-05-13T17:28:01Z","title":"On the Decidability of Monadic Second-Order Logic with Arithmetic\n Predicates","summary":" We investigate the decidability of the monadic second-order (MSO) theory of\nthe structure $\\langle \\mathbb{N};<,P_1, \\ldots,P_k \\rangle$, for various unary\npredicates $P_1,\\ldots,P_k \\subseteq \\mathbb{N}$. We focus in particular on\n\"arithmetic\" predicates arising in the study of linear recurrence sequences,\nsuch as fixed-base powers $\\mathsf{Pow}_k = \\{k^n : n \\in \\mathbb{N}\\}$, $k$-th\npowers $\\mathsf{N}_k = \\{n^k : n \\in \\mathbb{N}\\}$, and the set of terms of the\nFibonacci sequence $\\mathsf{Fib} = \\{0,1,2,3,5,8,13,\\ldots\\}$ (and similarly\nfor other linear recurrence sequences having a single, non-repeated, dominant\ncharacteristic root). 
We obtain several new unconditional and conditional\ndecidability results, a select sample of which are the following:\n $\\bullet$ The MSO theory of $\\langle \\mathbb{N};<,\\mathsf{Pow}_2,\n\\mathsf{Fib} \\rangle$ is decidable;\n $\\bullet$ The MSO theory of $\\langle \\mathbb{N};<, \\mathsf{Pow}_2,\n\\mathsf{Pow}_3, \\mathsf{Pow}_6 \\rangle$ is decidable;\n $\\bullet$ The MSO theory of $\\langle \\mathbb{N};<, \\mathsf{Pow}_2,\n\\mathsf{Pow}_3, \\mathsf{Pow}_5 \\rangle$ is decidable assuming Schanuel's\nconjecture;\n $\\bullet$ The MSO theory of $\\langle \\mathbb{N};<, \\mathsf{Pow}_4,\n\\mathsf{N}_2 \\rangle$ is decidable;\n $\\bullet$ The MSO theory of $\\langle \\mathbb{N};<, \\mathsf{Pow}_2,\n\\mathsf{N}_2 \\rangle$ is Turing-equivalent to the MSO theory of $\\langle\n\\mathbb{N};<,S \\rangle$, where $S$ is the predicate corresponding to the binary\nexpansion of $\\sqrt{2}$. (As the binary expansion of $\\sqrt{2}$ is widely\nbelieved to be normal, the corresponding MSO theory is in turn expected to be\ndecidable.)\n These results are obtained by exploiting and combining techniques from\ndynamical systems, number theory, and automata theory.\n","authors":["Valérie Berthé","Toghrul Karimov","Joris Nieuwveld","Joël Ouaknine","Mihir Vahanwala","James Worrell"],"pdf_url":"https://arxiv.org/pdf/2405.07953v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2304.09231v2","updated":"2024-05-20T20:21:17Z","published":"2023-04-18T18:42:35Z","title":"Embedded Finite Models Beyond Restricted Quantifier Collapse","summary":" We revisit evaluation of logical formulas that allow both uninterpreted\nrelations, constrained to be finite, as well as an interpreted vocabulary over\nan infinite domain. This formalism was denoted embedded finite model theory in\nthe past.\n It is clear that the expressiveness and evaluating complexity of formulas of\nthis type depends heavily on the infinite structure. 
If we embed in a wild\nstructure like the integers with additive and multiplicative arithmetic, logic\nis extremely expressive and formulas are impossible to evaluate. On the other\nhand, for some well-known decidable structures, the expressiveness and\nevaluating complexity are similar to the situation without the additional\ninfrastructure. The latter phenomenon was formalized via the notion of\n``Restricted Quantifier Collapse'': adding quantification over the infinite\nstructure does not add expressiveness. Beyond these two extremes little was\nknown.\n In this work we show that the possibilities for expressiveness and complexity\nare much wider. We show that we can get almost any possible complexity of\nevaluation while staying within a decidable structure. We also show that in\nsome decidable structures, there is a disconnect between expressiveness of the\nlogic and complexity, in that we cannot eliminate quantification over the\nstructure, but this is not due to an ability to embed complex relational\ncomputation in the logic.\n We show failure of collapse for the theory of finite fields and the related\ntheory of pseudo-finite fields, which will involve coding computation in the\nlogic. As a by-product of this, we establish new lower-bounds for the\ncomplexity of decision procedures for several decidable theories of fields,\nincluding the theory of finite fields.\n In the process of investigating this landscape, we investigate several\nweakenings of collapse.\n","authors":["Michael Benedikt","Ehud Hrushovski"],"pdf_url":"https://arxiv.org/pdf/2304.09231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12358v1","updated":"2024-05-20T20:14:32Z","published":"2024-05-20T20:14:32Z","title":"Using Color Refinement to Boost Enumeration and Counting for Acyclic CQs\n of Binary Schemas","summary":" We present an index structure, called the color-index, to boost the\nevaluation of acyclic conjunctive queries (ACQs) over binary schemas. 
The\ncolor-index is based on the color refinement algorithm, a widely used\nsubroutine for graph isomorphism testing algorithms. Given a database $D$, we\nuse a suitable version of the color refinement algorithm to produce a stable\ncoloring of $D$, an assignment from the active domain of $D$ to a set of colors\n$C_D$. The main ingredient of the color-index is a particular database $D_c$\nwhose active domain is $C_D$ and whose size is at most $|D|$. Using the\ncolor-index, we can evaluate any free-connex ACQ $Q$ over $D$ with\npreprocessing time $O(|Q| \\cdot |D_c|)$ and constant delay enumeration.\nFurthermore, we can also count the number of results of $Q$ over $D$ in time\n$O(|Q| \\cdot |D_c|)$. Given that $|D_c|$ could be much smaller than $|D|$ (even\nconstant-size for some families of databases), the color-index is the first\nindex structure for evaluating free-connex ACQs that allows efficient\nenumeration and counting with performance that may be strictly smaller than the\ndatabase size.\n","authors":["Cristian Riveros","Benjamin Scheidt","Nicole Schweikardt"],"pdf_url":"https://arxiv.org/pdf/2405.12358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.08181v3","updated":"2024-05-20T10:36:14Z","published":"2021-01-20T15:29:27Z","title":"Fair Asynchronous Session Subtyping","summary":" Session types are widely used as abstractions of asynchronous message passing\nsystems. Refinement for such abstractions is crucial as it allows improvements\nof a given component without compromising its compatibility with the rest of\nthe system. In the context of session types, the most general notion of\nrefinement is asynchronous session subtyping, which allows message emissions to\nbe anticipated w.r.t. a bounded amount of message consumptions. In this paper\nwe investigate the possibility to anticipate emissions w.r.t. 
an unbounded\namount of consumptions: to this aim we propose to consider fair compliance over\nasynchronous session types and fair refinement as the relation that preserves\nit. This allows us to propose a novel variant of session subtyping that\nleverages the notion of controllability from service contract theory and that\nis a sound characterisation of fair refinement. In addition, we show that both\nfair refinement and our novel subtyping are undecidable. We also present a\nsound algorithm which deals with examples that feature potentially unbounded\nbuffering. Finally, we present an implementation of our algorithm and an\nempirical evaluation of it on synthetic benchmarks.\n","authors":["Mario Bravetti","Julien Lange","Gianluigi Zavattaro"],"pdf_url":"https://arxiv.org/pdf/2101.08181v3.pdf","comment":null}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2311.18677v2","updated":"2024-05-20T15:37:36Z","published":"2023-11-30T16:24:42Z","title":"Splitwise: Efficient generative LLM inference using phase splitting","summary":" Recent innovations in generative large language models (LLMs) have made their\napplications and use-cases ubiquitous. This has led to large-scale deployments\nof these models, using complex, expensive, and power-hungry AI accelerators,\nmost commonly GPUs. These developments make LLM inference efficiency an\nimportant challenge. Based on our extensive characterization, we find that\nthere are two main phases during an LLM inference request: a compute-intensive\nprompt computation, and a memory-intensive token generation, each with distinct\nlatency, throughput, memory, and power characteristics. Despite\nstate-of-the-art batching and scheduling, the token generation phase\nunderutilizes compute resources. 
Specifically, unlike compute-intensive prompt\ncomputation phases, token generation phases do not require the compute\ncapability of the latest GPUs, and can be run with lower power and cost.\n With Splitwise, we propose splitting the two phases of a LLM inference\nrequest on to separate machines. This allows us to use hardware that is\nwell-suited for each phase, and provision resources independently per phase.\nHowever, splitting an inference request across machines requires state transfer\nfrom the machine running prompt computation over to the machine generating\ntokens. We implement and optimize this state transfer using the fast back-plane\ninterconnects available in today's GPU clusters.\n We use the Splitwise technique to design LLM inference clusters using the\nsame or different types of machines for the prompt computation and token\ngeneration phases. Our clusters are optimized for three key objectives:\nthroughput, cost, and power. In particular, we show that we can achieve 1.4x\nhigher throughput at 20% lower cost than current designs. Alternatively, we can\nachieve 2.35x more throughput with the same cost and power budgets.\n","authors":["Pratyush Patel","Esha Choukse","Chaojie Zhang","Aashaka Shah","Íñigo Goiri","Saeed Maleki","Ricardo Bianchini"],"pdf_url":"https://arxiv.org/pdf/2311.18677v2.pdf","comment":"12 pages, 19 figures"},{"id":"http://arxiv.org/abs/2405.12089v1","updated":"2024-05-20T15:02:13Z","published":"2024-05-20T15:02:13Z","title":"Using Formal Verification to Evaluate Single Event Upsets in a RISC-V\n Core","summary":" Reliability has been a major concern in embedded systems. Higher transistor\ndensity and lower voltage supply increase the vulnerability of embedded systems\nto soft errors. A Single Event Upset (SEU), which is also called a soft error,\ncan reverse a bit in a sequential element, resulting in a system failure.\nSimulation-based fault injection has been widely used to evaluate reliability,\nas suggested by ISO26262. 
However, it is practically impossible to test all\nfaults for a complex design. Random fault injection is a compromise that\nreduces accuracy and fault coverage. Formal verification is an alternative\napproach. In this paper, we use formal verification, in the form of model\nchecking, to evaluate the hardware reliability of a RISC-V Ibex Core in the\npresence of soft errors. Backward tracing is performed to identify and\ncategorize faults according to their effects (no effect, Silent Data\nCorruption, crashes, and hangs). By using formal verification, the entire state\nspace and fault list can be exhaustively explored. It is found that misaligned\ninstructions can amplify fault effects. It is also found that some bits are\nmore vulnerable to SEUs than others. In general, most of the bits in the Ibex\nCore are vulnerable to Silent Data Corruption, and the second pipeline stage is\nmore vulnerable to Silent Data Corruption than the first.\n","authors":["Bing Xue","Mark Zwolinski"],"pdf_url":"https://arxiv.org/pdf/2405.12089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11844v1","updated":"2024-05-20T07:38:19Z","published":"2024-05-20T07:38:19Z","title":"NeRTCAM: CAM-Based CMOS Implementation of Reference Frames for\n Neuromorphic Processors","summary":" Neuromorphic architectures mimicking biological neural networks have been\nproposed as a much more efficient alternative to conventional von Neumann\narchitectures for the exploding compute demands of AI workloads. Recent\nneuroscience theory on intelligence suggests that Cortical Columns (CCs) are\nthe fundamental compute units in the neocortex and intelligence arises from\nCC's ability to store, predict and infer information via structured Reference\nFrames (RFs). Based on this theory, recent works have demonstrated brain-like\nvisual object recognition using software simulation. 
Our work is the first\nattempt towards direct CMOS implementation of Reference Frames for building\nCC-based neuromorphic processors. We propose NeRTCAM (Neuromorphic Reverse\nTernary Content Addressable Memory), a CAM-based building block that supports\nthe key operations (store, predict, infer) required to perform inference using\nRFs. NeRTCAM architecture is presented in detail including its key components.\nAll designs are implemented in SystemVerilog and synthesized in 7nm CMOS, and\nhardware complexity scaling is evaluated for varying storage sizes. NeRTCAM\nsystem for biologically motivated MNIST inference with a storage size of 1024\nentries incurs just 0.15 mm^2 area, 400 mW power and 9.18 us critical path\nlatency, demonstrating the feasibility of direct CMOS implementation of\nCAM-based Reference Frames.\n","authors":["Harideep Nair","William Leyman","Agastya Sampath","Quinn Jacobson","John Paul Shen"],"pdf_url":"https://arxiv.org/pdf/2405.11844v1.pdf","comment":"Accepted and Presented at Neuro-Inspired Computational Elements\n (NICE) Conference, La Jolla, CA. 2024"},{"id":"http://arxiv.org/abs/2311.16543v3","updated":"2024-05-20T19:50:06Z","published":"2023-11-28T06:18:54Z","title":"RTLFixer: Automatically Fixing RTL Syntax Errors with Large Language\n Models","summary":" This paper presents RTLFixer, a novel framework enabling automatic syntax\nerrors fixing for Verilog code with Large Language Models (LLMs). Despite LLM's\npromising capabilities, our analysis indicates that approximately 55% of errors\nin LLM-generated Verilog are syntax-related, leading to compilation failures.\nTo tackle this issue, we introduce a novel debugging framework that employs\nRetrieval-Augmented Generation (RAG) and ReAct prompting, enabling LLMs to act\nas autonomous agents in interactively debugging the code with feedback. 
This\nframework demonstrates exceptional proficiency in resolving syntax errors,\nsuccessfully correcting about 98.5% of compilation errors in our debugging\ndataset, comprising 212 erroneous implementations derived from the VerilogEval\nbenchmark. Our method leads to 32.3% and 10.1% increase in pass@1 success rates\nin the VerilogEval-Machine and VerilogEval-Human benchmarks, respectively.\n","authors":["Yun-Da Tsai","Mingjie Liu","Haoxing Ren"],"pdf_url":"https://arxiv.org/pdf/2311.16543v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02666v4","updated":"2024-05-20T18:38:19Z","published":"2023-07-05T21:42:24Z","title":"Chiplet Cloud: Building AI Supercomputers for Serving Large Generative\n Language Models","summary":" Large language models (LLMs) such as OpenAI's ChatGPT and Google's Gemini\nhave demonstrated unprecedented capabilities of autoregressive AI models across\nmultiple tasks triggering disruptive technology innovations around the world.\nHowever, as models continue to grow the cost to serve these models also\ncontinues to grow threatening the democratization of LLMs.\n To address this issue, we propose Chiplet Cloud, a chiplet-based ASIC\nLLM-supercomputer architecture whose goal is to optimize the total cost of\nownership (TCO) per generated token. This architecture is a highly\nparameterizable ASIC and server-level architecture leveraging thousands of\nreplicated accelerator modules collaborating to scale-up the performance of\nLLMs at cloud-scale. To determine specific parameterizations of the Chiplet\nCloud architecture, we implemented a two-phase hardware-software co-design\nmethodology that can search the massive design space and fine tune the\narchitecture across a collection of LLMs based on an accurate inference\nsimulation. A common bottleneck for LLMs is the memory access performance\ntherefore we introduce CC-MEM, a scalable on-chip memory system for Chiplet\nCloud architectures. 
Using the CC-MEM, Chiplet Clouds can be built using only\nSRAMs for design points where the power and performance of memory access is\ncritical. The CC-MEM also includes a compression decoder module to add support\nfor sparse models without impacting the compute units using a\nStore-as-Compressed, Load-as-Dense mechanism.\n We evaluate Chiplet Cloud architectures across eight popular LLMs. Using fine\ntuned Chiplet Cloud servers we are able to achieve $97\\times$ and $18\\times$\nimprovement in TCO/Token over rented GPU and TPU clouds, or a $8.3\\times$ and\n$3.7\\times$ improvement over fabricated GPU and TPU clouds respectively.\nChiplet Cloud can also support $1.7\\times$ larger models with a sparsity of\n60\\%.\n","authors":["Huwan Peng","Scott Davidson","Richard Shi","Shuaiwen Leon Song","Michael Taylor"],"pdf_url":"https://arxiv.org/pdf/2307.02666v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12304v1","updated":"2024-05-20T18:11:45Z","published":"2024-05-20T18:11:45Z","title":"Automatic Hardware Pragma Insertion in High-Level Synthesis: A\n Non-Linear Programming Approach","summary":" High-level synthesis, source-to-source compilers, and various Design Space\nExploration techniques for pragma insertion have significantly improved the\nQuality of Results of generated designs. These tools offer benefits such as\nreduced development time and enhanced performance. However, achieving\nhigh-quality results often requires additional manual code transformations and\ntiling selections, which are typically performed separately or as\npre-processing steps. Although DSE techniques enable code transformation\nupfront, the vastness of the search space often limits the exploration of all\npossible code transformations, making it challenging to determine which\ntransformations are necessary. 
Additionally, ensuring correctness remains\nchallenging, especially for complex transformations and optimizations.\n To tackle this obstacle, we first propose a comprehensive framework\nleveraging HLS compilers. Our system streamlines code transformation, pragma\ninsertion, and tiles size selection for on-chip data caching through a unified\noptimization problem, aiming to enhance parallelization, particularly\nbeneficial for computation-bound kernels. Them employing a novel Non-Linear\nProgramming (NLP) approach, we simultaneously ascertain transformations,\npragmas, and tile sizes, focusing on regular loop-based kernels. Our evaluation\ndemonstrates that our framework adeptly identifies the appropriate\ntransformations, including scenarios where no transformation is necessary, and\ninserts pragmas to achieve a favorable Quality of Results.\n","authors":["Stéphane Pouget","Louis-Noël Pouchet","Jason Cong"],"pdf_url":"https://arxiv.org/pdf/2405.12304v1.pdf","comment":null}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2405.12182v1","updated":"2024-05-20T17:07:30Z","published":"2024-05-20T17:07:30Z","title":"Nearest Neighbors GParareal: Improving Scalability of Gaussian Processes\n for Parallel-in-Time Solvers","summary":" With the advent of supercomputers, multi-processor environments and\nparallel-in-time (PinT) algorithms offer ways to solve initial value problems\nfor ordinary and partial differential equations (ODEs and PDEs) over long time\nintervals, a task often unfeasible with sequential solvers within realistic\ntime frames. A recent approach, GParareal, combines Gaussian Processes with\ntraditional PinT methodology (Parareal) to achieve faster parallel speed-ups.\nThe method is known to outperform Parareal for low-dimensional ODEs and a\nlimited number of computer cores. Here, we present Nearest Neighbors GParareal\n(nnGParareal), a novel data-enriched PinT integration algorithm. 
nnGParareal\nbuilds upon GParareal by improving its scalability properties for\nhigher-dimensional systems and increased processor count. Through data\nreduction, the model complexity is reduced from cubic to log-linear in the\nsample size, yielding a fast and automated procedure to integrate initial value\nproblems over long time intervals. First, we provide both an upper bound for\nthe error and theoretical details on the speed-up benefits. Then, we\nempirically illustrate the superior performance of nnGParareal, compared to\nGParareal and Parareal, on nine different systems with unique features (e.g.,\nstiff, chaotic, high-dimensional, or challenging-to-learn systems).\n","authors":["Guglielmo Gattiglio","Lyudmila Grigoryeva","Massimiliano Tamborrino"],"pdf_url":"https://arxiv.org/pdf/2405.12182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13938v3","updated":"2024-05-20T16:04:08Z","published":"2023-12-21T15:32:20Z","title":"How Does Stake Distribution Influence Consensus? Analyzing Blockchain\n Decentralization","summary":" In the PoS blockchain landscape, the challenge of achieving full\ndecentralization is often hindered by a disproportionate concentration of\nstaked tokens among a few validators. This study analyses this challenge by\nfirst formalizing decentralization metrics for weighted consensus mechanisms.\nAn empirical analysis across ten permissionless blockchains uncovers\nsignificant weight concentration among validators, underscoring the need for an\nequitable approach. To counter this, we introduce the Square Root Stake Weight\n(SRSW) model, which effectively recalibrates staking weight distribution. Our\nexamination of the SRSW model demonstrates notable improvements in the\ndecentralization metrics: the Gini index improves by 37.16% on average, while\nNakamoto coefficients for liveness and safety see mean enhancements of 101.04%\nand 80.09%, respectively. 
This research is a pivotal step toward a more fair\nand equitable distribution of staking weight, advancing the decentralization in\nblockchain consensus mechanisms.\n","authors":["Shashank Motepalli","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2312.13938v3.pdf","comment":"To appear in ICBC 2024"},{"id":"http://arxiv.org/abs/2405.12120v1","updated":"2024-05-20T15:38:53Z","published":"2024-05-20T15:38:53Z","title":"EdgeLoc: A Communication-Adaptive Parallel System for Real-Time\n Localization in Infrastructure-Assisted Autonomous Driving","summary":" This paper presents EdgeLoc, an infrastructure-assisted, real-time\nlocalization system for autonomous driving that addresses the incompatibility\nbetween traditional localization methods and deep learning approaches. The\nsystem is built on top of the Robot Operating System (ROS) and combines the\nreal-time performance of traditional methods with the high accuracy of deep\nlearning approaches. The system leverages edge computing capabilities of\nroadside units (RSUs) for precise localization to enhance on-vehicle\nlocalization that is based on the real-time visual odometry. EdgeLoc is a\nparallel processing system, utilizing a proposed uncertainty-aware pose fusion\nsolution. It achieves communication adaptivity through online learning and\naddresses fluctuations via window-based detection. Moreover, it achieves\noptimal latency and maximum improvement by utilizing auto-splitting\nvehicle-infrastructure collaborative inference, as well as online distribution\nlearning for decision-making. Even with the most basic end-to-end deep neural\nnetwork for localization estimation, EdgeLoc realizes a 67.75\\% reduction in\nthe localization error for real-time local visual odometry, a 29.95\\% reduction\nfor non-real-time collaborative inference, and a 30.26\\% reduction compared to\nKalman filtering. 
Finally, accuracy-to-latency conversion was experimentally\nvalidated, and an overall experiment was conducted on a practical cellular\nnetwork. The system is open sourced at\nhttps://github.com/LoganCome/EdgeAssistedLocalization.\n","authors":["Boyi Liu","Jingwen Tong","Yufan Zhuang","Jiawei Shao","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.12120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18677v2","updated":"2024-05-20T15:37:36Z","published":"2023-11-30T16:24:42Z","title":"Splitwise: Efficient generative LLM inference using phase splitting","summary":" Recent innovations in generative large language models (LLMs) have made their\napplications and use-cases ubiquitous. This has led to large-scale deployments\nof these models, using complex, expensive, and power-hungry AI accelerators,\nmost commonly GPUs. These developments make LLM inference efficiency an\nimportant challenge. Based on our extensive characterization, we find that\nthere are two main phases during an LLM inference request: a compute-intensive\nprompt computation, and a memory-intensive token generation, each with distinct\nlatency, throughput, memory, and power characteristics. Despite\nstate-of-the-art batching and scheduling, the token generation phase\nunderutilizes compute resources. Specifically, unlike compute-intensive prompt\ncomputation phases, token generation phases do not require the compute\ncapability of the latest GPUs, and can be run with lower power and cost.\n With Splitwise, we propose splitting the two phases of a LLM inference\nrequest on to separate machines. This allows us to use hardware that is\nwell-suited for each phase, and provision resources independently per phase.\nHowever, splitting an inference request across machines requires state transfer\nfrom the machine running prompt computation over to the machine generating\ntokens. 
We implement and optimize this state transfer using the fast back-plane\ninterconnects available in today's GPU clusters.\n We use the Splitwise technique to design LLM inference clusters using the\nsame or different types of machines for the prompt computation and token\ngeneration phases. Our clusters are optimized for three key objectives:\nthroughput, cost, and power. In particular, we show that we can achieve 1.4x\nhigher throughput at 20% lower cost than current designs. Alternatively, we can\nachieve 2.35x more throughput with the same cost and power budgets.\n","authors":["Pratyush Patel","Esha Choukse","Chaojie Zhang","Aashaka Shah","Íñigo Goiri","Saeed Maleki","Ricardo Bianchini"],"pdf_url":"https://arxiv.org/pdf/2311.18677v2.pdf","comment":"12 pages, 19 figures"},{"id":"http://arxiv.org/abs/2405.12117v1","updated":"2024-05-20T15:32:47Z","published":"2024-05-20T15:32:47Z","title":"Strongly-Consistent Distributed Discrete-event Systems","summary":" Discrete-event (DE) systems are concurrent programs where components\ncommunicate via tagged events, where tags are drawn from a totally ordered set.\nReactors are an emerging model of computation based on DE and realized in the\nopen-source coordination language Lingua Franca. Distributed DE (DDE) systems\nare DE systems where the components (reactors) communicate over networks. The\nprior art has required that for DDE systems with cycles, each cycle must\ncontain at least one logical delay, where the tag of events is incremented.\nSuch delays, however, are not required by the elegant fixed-point semantics of\nDE. The only requirement is that the program be constructive, meaning it is\nfree of causality cycles. This paper gives a way to coordinate the execution of\nDDE systems that can execute any constructive program, even one with zero-delay\ncycles. It provides a formal model that exposes exactly the information that\nmust be shared across networks for such execution to be possible. 
Furthermore,\nit describes a concrete implementation that is an extension of the coordination\nmechanisms in Lingua Franca.\n","authors":["Peter Donovan","Erling Jellum","Byeonggil Jun","Hokeun Kim","Edward A. Lee","Shaokai Lin","Marten Lohstroh","Anirudh Rengarajan"],"pdf_url":"https://arxiv.org/pdf/2405.12117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12079v1","updated":"2024-05-20T14:49:45Z","published":"2024-05-20T14:49:45Z","title":"PARALLELGPUOS: A Concurrent OS-level GPU Checkpoint and Restore System\n using Validated Speculation","summary":" Checkpointing (C) and restoring (R) are key components for GPU tasks. POS is\nan OS-level GPU C/R system: It can transparently checkpoint or restore\nprocesses that use the GPU, without requiring any cooperation from the\napplication, a key feature required by modern systems like the cloud. Moreover,\nPOS is the first OS-level C/R system that can concurrently execute C/R with the\napplication execution: a critical feature that can be trivially achieved when\nthe processes only running on the CPU, but becomes challenging when the\nprocesses use GPU. The problem is how to ensure consistency during concurrent\nexecution with the lack of application semantics due to transparency. CPU\nprocesses can leverage OS and hardware paging to fix inconsistency without\napplication semantics. Unfortunately, GPU bypasses OS and paging for high\nperformance. POS fills the semantic gap by speculatively extracting buffer\naccess information of GPU kernels during runtime. Thanks to the simple and\nwell-structured nature of GPU kernels, our speculative extraction (with runtime\nvalidation) achieves 100% accuracy on applications from training to inference\nwhose domains span from vision, large language models, and reinforcement\nlearning. 
Based on the extracted semantics, we systematically overlap C/R with\napplication execution, and achieves orders of magnitude higher performance\nunder various tasks compared with the state-of-the-art OS-level GPU C/R,\nincluding training fault tolerance, live GPU process migration, and cold starts\nacceleration in GPU-based serverless computing.\n","authors":["Zhuobin Huang","Xingda Wei","Yingyi Hao","Rong Chen","Mingcong Han","Jinyu Gu","Haibo Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11342v3","updated":"2024-05-20T14:29:06Z","published":"2023-11-19T14:56:26Z","title":"On the Communication Complexity of Decentralized Bilevel Optimization","summary":" Decentralized bilevel optimization has been actively studied in the past few\nyears since it has widespread applications in machine learning. However,\nexisting algorithms suffer from large communication complexity caused by the\nestimation of stochastic hypergradient, limiting their application to\nreal-world tasks. To address this issue, we develop a novel decentralized\nstochastic bilevel gradient descent algorithm under the heterogeneous setting,\nwhich enjoys a small communication cost in each round and a small number of\ncommunication rounds. As such, it can achieve a much better communication\ncomplexity than existing algorithms without any strong assumptions regarding\nheterogeneity. To the best of our knowledge, this is the first stochastic\nalgorithm achieving these theoretical results under the heterogeneous setting.\nAt last, the experimental results confirm the efficacy of our algorithm.\n","authors":["Yihan Zhang","My T. 
Thai","Jie Wu","Hongchang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.11342v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12052v1","updated":"2024-05-20T14:18:36Z","published":"2024-05-20T14:18:36Z","title":"Parallelization of the K-Means Algorithm with Applications to Big Data\n Clustering","summary":" The K-Means clustering using LLoyd's algorithm is an iterative approach to\npartition the given dataset into K different clusters. The algorithm assigns\neach point to the cluster based on the following objective function\n \\[\\ \\min \\Sigma_{i=1}^{n}||x_i-\\mu_{x_i}||^2\\] The serial algorithm involves\niterative steps where we compute the distance of each datapoint from the\ncentroids and assign the datapoint to the nearest centroid. This approach is\nessentially known as the expectation-maximization step. Clustering involves\nextensive computations to calculate distances at each iteration, which\nincreases as the number of data points increases. This provides scope for\nparallelism. However, we must ensure that in a parallel process, each thread\nhas access to the updated centroid value and no racing condition exists on any\ncentroid values. We will compare two different approaches in this project. The\nfirst approach is an OpenMP flat synchronous method where all processes are run\nin parallel, and we use synchronization to ensure safe updates of clusters. The\nsecond approach we adopt is a GPU based parallelization approach using OpenACC\nwherein we will try to make use of GPU architecture to parallelize chunks of\nthe algorithm to observe decreased computation time. 
We will analyze metrics\nsuch as speed up, efficiency,time taken with varying data points, and number of\nprocesses to compare the two approaches and understand the relative performance\nimprovement we can get.\n","authors":["Ashish Srivastava","Mohammed Nawfal"],"pdf_url":"https://arxiv.org/pdf/2405.12052v1.pdf","comment":"7 Pages, 5 tables, 12 figures"},{"id":"http://arxiv.org/abs/2405.12046v1","updated":"2024-05-20T14:13:22Z","published":"2024-05-20T14:13:22Z","title":"Energy-Efficient Federated Edge Learning with Streaming Data: A Lyapunov\n Optimization Approach","summary":" Federated learning (FL) has received significant attention in recent years\nfor its advantages in efficient training of machine learning models across\ndistributed clients without disclosing user-sensitive data. Specifically, in\nfederated edge learning (FEEL) systems, the time-varying nature of wireless\nchannels introduces inevitable system dynamics in the communication process,\nthereby affecting training latency and energy consumption. In this work, we\nfurther consider a streaming data scenario where new training data samples are\nrandomly generated over time at edge devices. Our goal is to develop a dynamic\nscheduling and resource allocation algorithm to address the inherent randomness\nin data arrivals and resource availability under long-term energy constraints.\nTo achieve this, we formulate a stochastic network optimization problem and use\nthe Lyapunov drift-plus-penalty framework to obtain a dynamic resource\nmanagement design. Our proposed algorithm makes adaptive decisions on device\nscheduling, computational capacity adjustment, and allocation of bandwidth and\ntransmit power in every round. 
We provide convergence analysis for the\nconsidered setting with heterogeneous data and time-varying objective\nfunctions, which supports the rationale behind our proposed scheduling design.\nThe effectiveness of our scheme is verified through simulation results,\ndemonstrating improved learning performance and energy efficiency as compared\nto baseline schemes.\n","authors":["Chung-Hsuan Hu","Zheng Chen","Erik G. Larsson"],"pdf_url":"https://arxiv.org/pdf/2405.12046v1.pdf","comment":"Submitted to IEEE journals for possible publication"},{"id":"http://arxiv.org/abs/2306.17453v3","updated":"2024-05-20T10:41:45Z","published":"2023-06-30T07:57:30Z","title":"Pollen: High-throughput Federated Learning Simulation via Resource-Aware\n Client Placement","summary":" Federated Learning (FL) is a privacy-focused machine learning paradigm that\ncollaboratively trains models directly on edge devices. Simulation plays an\nessential role in FL adoption, helping develop novel aggregation and client\nsampling strategies. However, current simulators cannot emulate large-scale\nsystems in a time-efficient manner, which limits their utility and casts doubts\non generalizability.\n This work proposes Pollen, a novel resource-aware system for speeding up\nsimulations. Pollen addresses two limiting factors from existing simulators:\n(a) communication inefficiency derived from pull-based client execution and (b)\ninadequate load balance when using heterogeneous hardware. Pollen executes\nhigh-throughput FL simulations at scale by (a) using a push-based client\nplacement system, (b) learning how an adaptable scheduling of clients based on\nhardware statistics (c) estimating the optimal number of concurrent workers per\nGPU. We evaluate Pollen on four representative FL tasks and show that Pollen's\nplacement model increases GPU utilization and reduces idle time. 
We compare\nPollen to Flower, Flute, FedScale, Parrot, and pfl and show experimental\nspeed-ups of days or weeks.\n","authors":["Lorenzo Sani","Pedro Porto Buarque de Gusmão","Alex Iacob","Wanru Zhao","Xinchi Qiu","Yan Gao","Javier Fernandez-Marques","Nicholas Donald Lane"],"pdf_url":"https://arxiv.org/pdf/2306.17453v3.pdf","comment":"22 pages, 22 figures, 9 tables, under review"},{"id":"http://arxiv.org/abs/2405.11884v1","updated":"2024-05-20T08:57:39Z","published":"2024-05-20T08:57:39Z","title":"Vertical Federated Learning Hybrid Local Pre-training","summary":" Vertical Federated Learning (VFL), which has a broad range of real-world\napplications, has received much attention in both academia and industry.\nEnterprises aspire to exploit more valuable features of the same users from\ndiverse departments to boost their model prediction skills. VFL addresses this\ndemand and concurrently secures individual parties from exposing their raw\ndata. However, conventional VFL encounters a bottleneck as it only leverages\naligned samples, whose size shrinks with more parties involved, resulting in\ndata scarcity and the waste of unaligned data. To address this problem, we\npropose a novel VFL Hybrid Local Pre-training (VFLHLP) approach. VFLHLP first\npre-trains local networks on the local data of participating parties. 
Then it\nutilizes these pre-trained networks to adjust the sub-model for the labeled\nparty or enhance representation learning for other parties during downstream\nfederated learning on aligned data, boosting the performance of federated\nmodels.\n","authors":["Wenguo Li","Xinling Guo","Xu Jiao","Tiancheng Huang","Xiaoran Yan","Yao Yang"],"pdf_url":"https://arxiv.org/pdf/2405.11884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09671v2","updated":"2024-05-20T08:38:18Z","published":"2024-02-04T07:21:45Z","title":"CoRaiS: Lightweight Real-Time Scheduler for Multi-Edge Cooperative\n Computing","summary":" Multi-edge cooperative computing that combines constrained resources of\nmultiple edges into a powerful resource pool has the potential to deliver great\nbenefits, such as a tremendous computing power, improved response time, more\ndiversified services. However, the mass heterogeneous resources composition and\nlack of scheduling strategies make the modeling and cooperating of multi-edge\ncomputing system particularly complicated. This paper first proposes a\nsystem-level state evaluation model to shield the complex hardware\nconfigurations and redefine the different service capabilities at heterogeneous\nedges. Secondly, an integer linear programming model is designed to cater for\noptimally dispatching the distributed arriving requests. Finally, a\nlearning-based lightweight real-time scheduler, CoRaiS, is proposed. CoRaiS\nembeds the real-time states of multi-edge system and requests information, and\ncombines the embeddings with a policy network to schedule the requests, so that\nthe response time of all requests can be minimized. Evaluation results verify\nthat CoRaiS can make a high-quality scheduling decision in real time, and can\nbe generalized to other multi-edge computing system, regardless of system\nscales. 
Characteristic validation also demonstrates that CoRaiS successfully\nlearns to balance loads, perceive real-time state and recognize heterogeneity\nwhile scheduling.\n","authors":["Yujiao Hu","Qingmin Jia","Jinchao Chen","Yuan Yao","Yan Pan","Renchao Xie","F. Richard Yu"],"pdf_url":"https://arxiv.org/pdf/2403.09671v2.pdf","comment":"Accepted by IEEE Internet of Things Journal"},{"id":"http://arxiv.org/abs/2206.07248v3","updated":"2024-05-20T08:32:03Z","published":"2022-06-15T02:20:40Z","title":"Blockchain based Secure Energy Marketplace Scheme to Motivate Peer to\n Peer Microgrids","summary":" In the past years trend of microgrids is increasing very fast to reduce\npeak-hour costs. However, in these systems, third parties are still involved in\nselling surplus energy. This results in increased cost of energy and there are\nmany operational and security barriers in such systems. These issues can be\nsolved by the decentralized distributed system of microgrids where a consumer\ncan locally sell their surplus energy to another consumer. To deploy such a\nsystem, one must consider security barriers for the transaction of energy. This\npaper proposes a solution to these problems by devising a scheme as a\nmarketplace where users interact with each other to buy and sell energy at\nbetter rates and get energy-generating resources on lease so that users do not\nhave to worry about capital investment. Agreement between owner of resources\nand consumer is recorded on blockchain based smart contracts. 
In this paper, a\nsurvey is performed for existing well known, decentralized energy solutions.\nThis paper also proposes an extra layer of security to leverage a shielded\nexecution environment so that information of energy generated, utilized, and\nshared cannot be changed by consumers and third parties even if the system is\ncompromised.\n","authors":["Muhammad Awais","Qamar Abbas","Shehbaz Tariq","Sayyaf Haider Warraich"],"pdf_url":"https://arxiv.org/pdf/2206.07248v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03181v2","updated":"2024-05-20T08:09:41Z","published":"2024-05-06T06:12:17Z","title":"Collaborative Satellite Computing through Adaptive DNN Task Splitting\n and Offloading","summary":" Satellite computing has emerged as a promising technology for next-generation\nwireless networks. This innovative technology provides data processing\ncapabilities, which facilitates the widespread implementation of artificial\nintelligence (AI)-based applications, especially for image processing tasks\ninvolving deep neural network (DNN). With the limited computing resources of an\nindividual satellite, independently handling DNN tasks generated by diverse\nuser equipments (UEs) becomes a significant challenge. One viable solution is\ndividing a DNN task into multiple subtasks and subsequently distributing them\nacross multiple satellites for collaborative computing. However, it is\nchallenging to partition DNN appropriately and allocate subtasks into suitable\nsatellites while ensuring load balancing. To this end, we propose a\ncollaborative satellite computing system designed to improve task processing\nefficiency in satellite networks. Based on this system, a workload-balanced\nadaptive task splitting scheme is developed to equitably distribute the\nworkload of DNN slices for collaborative inference, consequently enhancing the\nutilization of satellite computing resources. 
Additionally, a self-adaptive\ntask offloading scheme based on a genetic algorithm (GA) is introduced to\ndetermine optimal offloading decisions within dynamic network environments. The\nnumerical results illustrate that our proposal can outperform comparable\nmethods in terms of task completion rate, delay, and resource utilization.\n","authors":["Shifeng Peng","Xuefeng Hou","Zhishu Shen","Qiushi Zheng","Jiong Jin","Atsushi Tagami","Jingling Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.03181v2.pdf","comment":"Accepted by 29th IEEE Symposium on Computers and Communications\n (ISCC)"},{"id":"http://arxiv.org/abs/2405.11836v1","updated":"2024-05-20T07:15:41Z","published":"2024-05-20T07:15:41Z","title":"PLASMA -- Platform for Service Management in Digital Remote Maintenance\n Applications","summary":" To support maintenance and servicing of industrial machines, service\nprocesses are even today often performed manually and analogously, although\nsupportive technologies such as augmented reality, virtual reality and digital\nplatforms already exist. In many cases, neither technicians on-site nor remote\nexperts have all the essential information and options for suitable actions\navailable. Existing service products and platforms do not cover all the\nrequired functions in practice in order to map end-to-end processes. PLASMA is\na concept for a Cloud-based remote maintenance platform designed to meet these\ndemands. 
But for a real-life implementation of PLASMA, security measures are\nessential as we show in this paper.\n","authors":["Natascha Stumpp","Doris Aschenbrenner","Manuel Stahl","Andreas Aßmuth"],"pdf_url":"https://arxiv.org/pdf/2405.11836v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2405.11811v1","updated":"2024-05-20T06:12:33Z","published":"2024-05-20T06:12:33Z","title":"FedCAda: Adaptive Client-Side Optimization for Accelerated and Stable\n Federated Learning","summary":" Federated learning (FL) has emerged as a prominent approach for collaborative\ntraining of machine learning models across distributed clients while preserving\ndata privacy. However, the quest to balance acceleration and stability becomes\na significant challenge in FL, especially on the client-side. In this paper, we\nintroduce FedCAda, an innovative federated client adaptive algorithm designed\nto tackle this challenge. FedCAda leverages the Adam algorithm to adjust the\ncorrection process of the first moment estimate $m$ and the second moment\nestimate $v$ on the client-side and aggregate adaptive algorithm parameters on\nthe server-side, aiming to accelerate convergence speed and communication\nefficiency while ensuring stability and performance. Additionally, we\ninvestigate several algorithms incorporating different adjustment functions.\nThis comparative analysis revealed that due to the limited information\ncontained within client models from other clients during the initial stages of\nfederated learning, more substantial constraints need to be imposed on the\nparameters of the adaptive algorithm. As federated learning progresses and\nclients gather more global information, FedCAda gradually diminishes the impact\non adaptive parameters. These findings provide insights for enhancing the\nrobustness and efficiency of algorithmic improvements. 
Through extensive\nexperiments on computer vision (CV) and natural language processing (NLP)\ndatasets, we demonstrate that FedCAda outperforms the state-of-the-art methods\nin terms of adaptability, convergence, stability, and overall performance. This\nwork contributes to adaptive algorithms for federated learning, encouraging\nfurther exploration.\n","authors":["Liuzhi Zhou","Yu He","Kun Zhai","Xiang Liu","Sen Liu","Xingjun Ma","Guangnan Ye","Yu-Gang Jiang","Hongfeng Chai"],"pdf_url":"https://arxiv.org/pdf/2405.11811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07949v2","updated":"2024-05-20T18:02:32Z","published":"2024-05-13T17:25:40Z","title":"Online Load and Graph Balancing for Random Order Inputs","summary":" Online load balancing for heterogeneous machines aims to minimize the\nmakespan (maximum machine workload) by scheduling arriving jobs with varying\nsizes on different machines. In the adversarial setting, where an adversary\nchooses not only the collection of job sizes but also their arrival order, the\nproblem is well-understood and the optimal competitive ratio is known to be\n$\\Theta(\\log m)$ where $m$ is the number of machines. In the more realistic\nrandom arrival order model, the understanding is limited. Previously, the best\nlower bound on the competitive ratio was only $\\Omega(\\log \\log m)$.\n We significantly improve this bound by showing an $\\Omega( \\sqrt {\\log m})$\nlower bound, even for the restricted case where each job has a unit size on two\nmachines and infinite size on the others. 
On the positive side, we propose an\n$O(\\log m/\\log \\log m)$-competitive algorithm, demonstrating that better\nperformance is possible in the random arrival model.\n","authors":["Sungjin Im","Ravi Kumar","Shi Li","Aditya Petety","Manish Purohit"],"pdf_url":"https://arxiv.org/pdf/2405.07949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02384v4","updated":"2024-05-20T20:18:42Z","published":"2023-03-04T11:30:16Z","title":"Hierarchical Training of Deep Neural Networks Using Early Exiting","summary":" Deep neural networks provide state-of-the-art accuracy for vision tasks but\nthey require significant resources for training. Thus, they are trained on\ncloud servers far from the edge devices that acquire the data. This issue\nincreases communication cost, runtime and privacy concerns. In this study, a\nnovel hierarchical training method for deep neural networks is proposed that\nuses early exits in a divided architecture between edge and cloud workers to\nreduce the communication cost, training runtime and privacy concerns. The\nmethod proposes a brand-new use case for early exits to separate the backward\npass of neural networks between the edge and the cloud during the training\nphase. We address the issues of most available methods that due to the\nsequential nature of the training phase, cannot train the levels of hierarchy\nsimultaneously or they do it with the cost of compromising privacy. In\ncontrast, our method can use both edge and cloud workers simultaneously, does\nnot share the raw input data with the cloud and does not require communication\nduring the backward pass. Several simulations and on-device experiments for\ndifferent neural network architectures demonstrate the effectiveness of this\nmethod. 
It is shown that the proposed method reduces the training runtime for\nVGG-16 and ResNet-18 architectures by 29% and 61% in CIFAR-10 classification\nand by 25% and 81% in Tiny ImageNet classification when the communication with\nthe cloud is done over a low bit rate channel. This gain in the runtime is\nachieved whilst the accuracy drop is negligible. This method is advantageous\nfor online learning of high-accuracy deep neural networks on sensor-holding\nlow-resource devices such as mobile phones or robots as a part of an edge-cloud\nsystem, making them more flexible in facing new tasks and classes of data.\n","authors":["Yamin Sepehri","Pedram Pad","Ahmet Caner Yüzügüler","Pascal Frossard","L. Andrea Dunbar"],"pdf_url":"https://arxiv.org/pdf/2303.02384v4.pdf","comment":"Accepted to IEEE Transactions on Neural Networks and Learning Systems\n (2024), 15 pages, 10 figures, 3 Tables"},{"id":"http://arxiv.org/abs/2405.12322v1","updated":"2024-05-20T18:40:27Z","published":"2024-05-20T18:40:27Z","title":"Securing Blockchain-based IoT Systems with Physical Unclonable Functions\n and Zero-Knowledge Proofs","summary":" This paper presents a framework for securing blockchain-based IoT systems by\nintegrating Physical Unclonable Functions (PUFs) and Zero-Knowledge Proofs\n(ZKPs) within a Hyperledger Fabric environment. The proposed framework\nleverages PUFs for unique device identification and ZKPs for privacy-preserving\nauthentication and transaction processing. Experimental results demonstrate the\nframework's feasibility, performance, and security against various attacks.\nThis framework provides a comprehensive solution for addressing the security\nchallenges in blockchain-based IoT systems.\n","authors":["Daniel Commey","Sena Hounsinou","Garth V. 
Crosby"],"pdf_url":"https://arxiv.org/pdf/2405.12322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12311v1","updated":"2024-05-20T18:14:31Z","published":"2024-05-20T18:14:31Z","title":"Cost-Optimal Microservices Deployment with Cluster Autoscaling and Spot\n Pricing","summary":" Microservices architecture has been established as an ideal software\narchitecture for cloud-based software development and deployment, offering many\nbenefits such as agility and efficiency. Microservices are often associated\nwith containers and container orchestration systems for deployment, as\ncontainerization provides convenient tools and techniques for resource\nmanagement, including the automation of orchestration processes. Among the\nfactors that make the cloud suitable for commercial software deployment,\ntransient pricing options like AWS Spot Pricing are particularly attractive as\nthey allow consumers to significantly reduce cloud costs. However, the dynamic\nnature of resource demand and the abrupt termination of spot VMs make transient\npricing challenging. Nonetheless, containerization and container orchestration\nsystems open new avenues to optimize the cost of microservices deployments by\nleveraging spot pricing on the public cloud while achieving application and\nbusiness goals.\n We propose SpotKube, an open-source, Kubernetes-based, application-aware,\ngenetic algorithm-based solution for cost optimization, which autoscales\nclusters for microservices-based applications hosted on public clouds with spot\npricing options. SpotKube analyzes application characteristics and recommends\nthe optimal configuration for resource allocation to the cluster. It consists\nof an elastic cluster autoscaler powered by an optimization algorithm that\nensures cost-effective microservices deployment while meeting application\nperformance requirements and handling abrupt termination of nodes, thereby\nminimizing the impact on system availability. 
We implement and evaluate\nSpotKube with representative microservices-based applications in a real public\ncloud setup, demonstrating the effectiveness of our approach against\nalternative optimization strategies.\n","authors":["Dasith Edirisinghe","Kavinda Rajapakse","Pasindu Abeysinghe","Sunimal Rathnayake"],"pdf_url":"https://arxiv.org/pdf/2405.12311v1.pdf","comment":"11 pages including references, 11 figures, Keywords: Microservice,\n Cost optimization, Cluster Autoscaling, Transient Pricing"},{"id":"http://arxiv.org/abs/2405.13066v1","updated":"2024-05-20T16:14:39Z","published":"2024-05-20T16:14:39Z","title":"Practical Performance of a Distributed Processing Framework for\n Machine-Learning-based NIDS","summary":" Network Intrusion Detection Systems (NIDSs) detect intrusion attacks in\nnetwork traffic. In particular, machine-learning-based NIDSs have attracted\nattention because of their high detection rates of unknown attacks. A\ndistributed processing framework for machine-learning-based NIDSs employing a\nscalable distributed stream processing system has been proposed in the\nliterature. However, its performance, when machine-learning-based classifiers\nare implemented has not been comprehensively evaluated. In this study, we\nimplement five representative classifiers (Decision Tree, Random Forest, Naive\nBayes, SVM, and kNN) based on this framework and evaluate their throughput and\nlatency. 
By conducting the experimental measurements, we investigate the\ndifference in the processing performance among these classifiers and the\nbottlenecks in the processing performance of the framework.\n","authors":["Maho Kajiura","Junya Nakamura"],"pdf_url":"https://arxiv.org/pdf/2405.13066v1.pdf","comment":"This paper was accepted at the 14th IEEE International Workshop on\n Network Technologies for Security, Administration & Protection (NETSAP 2024)"},{"id":"http://arxiv.org/abs/2405.13062v1","updated":"2024-05-20T14:41:59Z","published":"2024-05-20T14:41:59Z","title":"StatAvg: Mitigating Data Heterogeneity in Federated Learning for\n Intrusion Detection Systems","summary":" Federated learning (FL) is a decentralized learning technique that enables\nparticipating devices to collaboratively build a shared Machine Leaning (ML) or\nDeep Learning (DL) model without revealing their raw data to a third party. Due\nto its privacy-preserving nature, FL has sparked widespread attention for\nbuilding Intrusion Detection Systems (IDS) within the realm of cybersecurity.\nHowever, the data heterogeneity across participating domains and entities\npresents significant challenges for the reliable implementation of an FL-based\nIDS. In this paper, we propose an effective method called Statistical Averaging\n(StatAvg) to alleviate non-independently and identically (non-iid) distributed\nfeatures across local clients' data in FL. In particular, StatAvg allows the FL\nclients to share their individual data statistics with the server, which then\naggregates this information to produce global statistics. The latter are shared\nwith the clients and used for universal data normalisation. It is worth\nmentioning that StatAvg can seamlessly integrate with any FL aggregation\nstrategy, as it occurs before the actual FL training process. The proposed\nmethod is evaluated against baseline approaches using datasets for network and\nhost Artificial Intelligence (AI)-powered IDS. 
The experimental results\ndemonstrate the efficiency of StatAvg in mitigating non-iid feature\ndistributions across the FL clients compared to the baseline methods.\n","authors":["Pavlos S. Bouzinis","Panagiotis Radoglou-Grammatikis","Ioannis Makris","Thomas Lagkas","Vasileios Argyriou","Georgios Th. Papadopoulos","Panagiotis Sarigiannidis","George K. Karagiannidis"],"pdf_url":"https://arxiv.org/pdf/2405.13062v1.pdf","comment":"10 pages, 8 figures"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2402.03103v2","updated":"2024-05-20T10:28:52Z","published":"2024-02-05T15:31:41Z","title":"Scoped Effects as Parameterized Algebraic Theories","summary":" Notions of computation can be modelled by monads. Algebraic effects offer a\ncharacterization of monads in terms of algebraic operations and equational\naxioms, where operations are basic programming features, such as reading or\nupdating the state, and axioms specify observably equivalent expressions.\nHowever, many useful programming features depend on additional mechanisms such\nas delimited scopes or dynamically allocated resources. Such mechanisms can be\nsupported via extensions to algebraic effects including scoped effects and\nparameterized algebraic theories. We present a fresh perspective on scoped\neffects by translation into a variation of parameterized algebraic theories.\nThe translation enables a new approach to equational reasoning for scoped\neffects and gives rise to an alternative characterization of monads in terms of\ngenerators and equations involving both scoped and algebraic operations. 
We\ndemonstrate the power of our fresh perspective by way of equational\ncharacterizations of several known models of scoped effects.\n","authors":["Cristina Matache","Sam Lindley","Sean Moss","Sam Staton","Nicolas Wu","Zhixuan Yang"],"pdf_url":"https://arxiv.org/pdf/2402.03103v2.pdf","comment":"Extended version of the ESOP 2024 paper with the same title"},{"id":"http://arxiv.org/abs/2101.08181v3","updated":"2024-05-20T10:36:14Z","published":"2021-01-20T15:29:27Z","title":"Fair Asynchronous Session Subtyping","summary":" Session types are widely used as abstractions of asynchronous message passing\nsystems. Refinement for such abstractions is crucial as it allows improvements\nof a given component without compromising its compatibility with the rest of\nthe system. In the context of session types, the most general notion of\nrefinement is asynchronous session subtyping, which allows message emissions to\nbe anticipated w.r.t. a bounded amount of message consumptions. In this paper\nwe investigate the possibility to anticipate emissions w.r.t. an unbounded\namount of consumptions: to this aim we propose to consider fair compliance over\nasynchronous session types and fair refinement as the relation that preserves\nit. This allows us to propose a novel variant of session subtyping that\nleverages the notion of controllability from service contract theory and that\nis a sound characterisation of fair refinement. In addition, we show that both\nfair refinement and our novel subtyping are undecidable. We also present a\nsound algorithm which deals with examples that feature potentially unbounded\nbuffering. 
Finally, we present an implementation of our algorithm and an\nempirical evaluation of it on synthetic benchmarks.\n","authors":["Mario Bravetti","Julien Lange","Gianluigi Zavattaro"],"pdf_url":"https://arxiv.org/pdf/2101.08181v3.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2405.12034v1","updated":"2024-05-20T14:01:38Z","published":"2024-05-20T14:01:38Z","title":"Count-Min Sketch with Conservative Updates: Worst-Case Analysis","summary":" Count-Min Sketch with Conservative Updates (\\texttt{CMS-CU}) is a\nmemory-efficient hash-based data structure used to estimate the occurrences of\nitems within a data stream. \\texttt{CMS-CU} stores~$m$ counters and employs~$d$\nhash functions to map items to these counters. We first argue that the\nestimation error in \\texttt{CMS-CU} is maximal when each item appears at most\nonce in the stream. Next, we study \\texttt{CMS-CU} in this setting. Precisely,\n\\begin{enumerate}\n \\item In the case where~$d=m-1$, we prove that the average estimation error\nand the average counter rate converge almost surely to~$\\frac{1}{2}$,\ncontrasting with the vanilla Count-Min Sketch, where the average counter rate\nis equal to~$\\frac{m-1}{m}$.\n \\item For any given~$m$ and~$d$, we prove novel lower and upper bounds on the\naverage estimation error, incorporating a positive integer parameter~$g$.\nLarger values of this parameter improve the accuracy of the bounds. Moreover,\nthe computation of each bound involves examining an ergodic Markov process with\na state space of size~$\\binom{m+g-d}{g}$ and a sparse transition probabilities\nmatrix containing~$\\mathcal{O}(m\\binom{m+g-d}{g})$ non-zero entries.\n \\item For~$d=m-1$, $g=1$, and as $m\\to \\infty$, we show that the lower and\nupper bounds coincide. In general, our bounds exhibit high accuracy for small\nvalues of $g$, as shown by numerical computation. 
For example, for $m=50$,\n$d=4$, and $g=5$, the difference between the lower and upper bounds is smaller\nthan~$10^{-4}$.\n \\end{enumerate}\n","authors":["Younes Ben Mazziane","Othmane Marfoq"],"pdf_url":"https://arxiv.org/pdf/2405.12034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11927v1","updated":"2024-05-20T10:06:19Z","published":"2024-05-20T10:06:19Z","title":"Response time in a pair of processor sharing queues with\n Join-the-Shortest-Queue scheduling","summary":" Join-the-Shortest-Queue (JSQ) is the scheduling policy of choice for many\nnetwork providers, cloud servers and traffic management systems, where\nindividual queues are served under processor sharing (PS) queueing discipline.\nA numerical solution for the response time distribution in two parallel PS\nqueues with JSQ scheduling is derived for the first time. Using the generating\nfunction method, two partial differential equations (PDEs) are obtained\ncorresponding to conditional response times, where the conditioning is on a\nparticular traced task joining the first or the second queue. These PDEs are\nfunctional equations that contain partial generating functions and their\npartial derivatives, and therefore cannot be solved by commonly used\ntechniques. We are able to solve these PDEs numerically with good accuracy and\nperform the deconditioning with respect to the queue-length probabilities by\nevaluating a certain complex integral. 
Numerical results for the density and\nthe first four moments compare well against regenerative simulation with\n500,000 regeneration cycles.\n","authors":["Julianna Bor","Peter G Harrison"],"pdf_url":"https://arxiv.org/pdf/2405.11927v1.pdf","comment":null}],"Operation Systems":[{"id":"http://arxiv.org/abs/2405.12079v1","updated":"2024-05-20T14:49:45Z","published":"2024-05-20T14:49:45Z","title":"PARALLELGPUOS: A Concurrent OS-level GPU Checkpoint and Restore System\n using Validated Speculation","summary":" Checkpointing (C) and restoring (R) are key components for GPU tasks. POS is\nan OS-level GPU C/R system: It can transparently checkpoint or restore\nprocesses that use the GPU, without requiring any cooperation from the\napplication, a key feature required by modern systems like the cloud. Moreover,\nPOS is the first OS-level C/R system that can concurrently execute C/R with the\napplication execution: a critical feature that can be trivially achieved when\nthe processes only running on the CPU, but becomes challenging when the\nprocesses use GPU. The problem is how to ensure consistency during concurrent\nexecution with the lack of application semantics due to transparency. CPU\nprocesses can leverage OS and hardware paging to fix inconsistency without\napplication semantics. Unfortunately, GPU bypasses OS and paging for high\nperformance. POS fills the semantic gap by speculatively extracting buffer\naccess information of GPU kernels during runtime. Thanks to the simple and\nwell-structured nature of GPU kernels, our speculative extraction (with runtime\nvalidation) achieves 100% accuracy on applications from training to inference\nwhose domains span from vision, large language models, and reinforcement\nlearning. 
Based on the extracted semantics, we systematically overlap C/R with\napplication execution, and achieves orders of magnitude higher performance\nunder various tasks compared with the state-of-the-art OS-level GPU C/R,\nincluding training fault tolerance, live GPU process migration, and cold starts\nacceleration in GPU-based serverless computing.\n","authors":["Zhuobin Huang","Xingda Wei","Yingyi Hao","Rong Chen","Mingcong Han","Jinyu Gu","Haibo Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12079v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2402.08434v2","updated":"2024-05-20T16:44:06Z","published":"2024-02-13T13:03:49Z","title":"Solving promise equations over monoids and groups","summary":" We give a complete complexity classification for the problem of finding a\nsolution to a given system of equations over a fixed finite monoid, given that\na solution over a more restricted monoid exists. As a corollary, we obtain a\ncomplexity classification for the same problem over groups.\n","authors":["Alberto Larrauri","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2402.08434v2.pdf","comment":"Full version of an ICALP 2024 paper"},{"id":"http://arxiv.org/abs/2405.12085v1","updated":"2024-05-20T14:55:20Z","published":"2024-05-20T14:55:20Z","title":"Noise-tolerant learnability of shallow quantum circuits from statistics\n and the cost of quantum pseudorandomness","summary":" This work studies the learnability of unknown quantum circuits in the near\nterm. We prove the natural robustness of quantum statistical queries for\nlearning quantum processes and provide an efficient way to benchmark various\nclasses of noise from statistics, which gives us a powerful framework for\ndeveloping noise-tolerant algorithms. We adapt a learning algorithm for\nconstant-depth quantum circuits to the quantum statistical query setting with a\nsmall overhead in the query complexity. 
We prove average-case lower bounds for\nlearning random quantum circuits of logarithmic and higher depths within\ndiamond distance with statistical queries. Additionally, we show the hardness\nof the quantum threshold search problem from quantum statistical queries and\ndiscuss its implications for the learnability of shallow quantum circuits.\nFinally, we prove that pseudorandom unitaries (PRUs) cannot be constructed\nusing circuits of constant depth by constructing an efficient distinguisher and\nproving a new variation of the quantum no-free lunch theorem.\n","authors":["Chirag Wadhwa","Mina Doosti"],"pdf_url":"https://arxiv.org/pdf/2405.12085v1.pdf","comment":"23+7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2212.03457v2","updated":"2024-05-20T00:18:22Z","published":"2022-12-07T04:46:26Z","title":"Partial gathering of mobile agents in dynamic rings","summary":" In this paper, we consider the partial gathering problem of mobile agents in\nsynchronous dynamic bidirectional ring networks. When k agents are distributed\nin the network, the partial gathering problem requires, for a given positive\ninteger g (< k), that agents terminate in a configuration such that either at\nleast g agents or no agent exists at each node. So far, the partial gathering\nproblem has been considered in static graphs. In this paper, we start\nconsidering partial gathering in dynamic graphs. As a first step, we consider\nthis problem in 1-interval connected rings, that is, one of the links in a ring\nmay be missing at each time step. In such networks, focusing on the\nrelationship between the values of k and g, we fully characterize the\nsolvability of the partial gathering problem and analyze the move complexity of\nthe proposed algorithms when the problem can be solved. First, we show that the\ng-partial gathering problem is unsolvable when k <= 2g. 
Second, we show that\nthe problem can be solved with O(n log g) time and the total number of O(gn log\ng) moves when 2g + 1 <= k <= 3g - 2. Third, we show that the problem can be\nsolved with O(n) time and the total number of O(kn) moves when 3g - 1 <= k <=\n8g - 4. Notice that since k = O(g) holds when 3g - 1 <= k <= 8g - 4, the move\ncomplexity O(kn) in this case can be represented also as O(gn). Finally, we\nshow that the problem can be solved with O(n) time and the total number of\nO(gn) moves when k >= 8g - 3. These results mean that the partial gathering\nproblem can be solved also in dynamic rings when k >= 2g + 1. In addition,\nagents require a total number of \\Omega(gn) moves to solve the partial (resp.,\ntotal) gathering problem. Thus, when k >= 3g - 1, agents can solve the partial\ngathering problem with the asymptotically optimal total number of O(gn) moves.\n","authors":["Masahiro Shibata","Yuichi Sudo","Junya Nakamura","Yonghwan Kim"],"pdf_url":"https://arxiv.org/pdf/2212.03457v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15728v3","updated":"2024-05-20T19:09:45Z","published":"2023-08-30T03:11:42Z","title":"Computational Lower Bounds for Graphon Estimation via Low-degree\n Polynomials","summary":" Graphon estimation has been one of the most fundamental problems in network\nanalysis and has received considerable attention in the past decade. From the\nstatistical perspective, the minimax error rate of graphon estimation has been\nestablished by Gao et al (2015) for both stochastic block model and\nnonparametric graphon estimation. The statistical optimal estimators are based\non constrained least squares and have computational complexity exponential in\nthe dimension. 
From the computational perspective, the best-known\npolynomial-time estimator is based universal singular value thresholding, but\nit can only achieve a much slower estimation error rate than the minimax one.\nThe computational optimality of the USVT or the existence of a computational\nbarrier in graphon estimation has been a long-standing open problem. In this\nwork, we provide rigorous evidence for the computational barrier in graphon\nestimation via low-degree polynomials. Specifically, in SBM graphon estimation,\nwe show that for low-degree polynomial estimators, their estimation error rates\ncannot be significantly better than that of the USVT under a wide range of\nparameter regimes and in nonparametric graphon estimation, we show low-degree\npolynomial estimators achieve estimation error rates strictly slower than the\nminimax rate. Our results are proved based on the recent development of\nlow-degree polynomials by Schramm and Wein (2022), while we overcome a few key\nchallenges in applying it to the general graphon estimation problem. By\nleveraging our main results, we also provide a computational lower bound on the\nclustering error for community detection in SBM with a growing number of\ncommunities and this yields a new piece of evidence for the conjectured\nKesten-Stigum threshold for efficient community recovery. 
Finally, we extend\nour computational lower bounds to sparse graphon estimation and biclustering.\n","authors":["Yuetian Luo","Chao Gao"],"pdf_url":"https://arxiv.org/pdf/2308.15728v3.pdf","comment":"Add low-degree upper bound in v2"}]},"2024-05-21T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2405.11884v2","updated":"2024-05-21T07:46:03Z","published":"2024-05-20T08:57:39Z","title":"Vertical Federated Learning Hybrid Local Pre-training","summary":" Vertical Federated Learning (VFL), which has a broad range of real-world\napplications, has received much attention in both academia and industry.\nEnterprises aspire to exploit more valuable features of the same users from\ndiverse departments to boost their model prediction skills. VFL addresses this\ndemand and concurrently secures individual parties from exposing their raw\ndata. However, conventional VFL encounters a bottleneck as it only leverages\naligned samples, whose size shrinks with more parties involved, resulting in\ndata scarcity and the waste of unaligned data. To address this problem, we\npropose a novel VFL Hybrid Local Pre-training (VFLHLP) approach. VFLHLP first\npre-trains local networks on the local data of participating parties. Then it\nutilizes these pre-trained networks to adjust the sub-model for the labeled\nparty or enhance representation learning for other parties during downstream\nfederated learning on aligned data, boosting the performance of federated\nmodels. The experimental results on real-world advertising datasets,\ndemonstrate that our approach achieves the best performance over baseline\nmethods by large margins. 
The ablation study further illustrates the\ncontribution of each technique in VFLHLP to its overall performance.\n","authors":["Wenguo Li","Xinling Guo","Xu Jiao","Tiancheng Huang","Xiaoran Yan","Yao Yang"],"pdf_url":"https://arxiv.org/pdf/2405.11884v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11440v2","updated":"2024-05-21T12:52:38Z","published":"2024-05-19T04:23:40Z","title":"A GAN-Based Data Poisoning Attack Against Federated Learning Systems and\n Its Countermeasure","summary":" As a distributed machine learning paradigm, federated learning (FL) is\ncollaboratively carried out on privately owned datasets but without direct data\naccess. Although the original intention is to allay data privacy concerns,\n\"available but not visible\" data in FL potentially brings new security threats,\nparticularly poisoning attacks that target such \"not visible\" local data.\nInitial attempts have been made to conduct data poisoning attacks against FL\nsystems, but cannot be fully successful due to their high chance of causing\nstatistical anomalies. To unleash the potential for truly \"invisible\" attacks\nand build a more deterrent threat model, in this paper, a new data poisoning\nattack model named VagueGAN is proposed, which can generate seemingly\nlegitimate but noisy poisoned data by untraditionally taking advantage of\ngenerative adversarial network (GAN) variants. Capable of manipulating the\nquality of poisoned data on demand, VagueGAN enables to trade-off attack\neffectiveness and stealthiness. Furthermore, a cost-effective countermeasure\nnamed Model Consistency-Based Defense (MCD) is proposed to identify\nGAN-poisoned data or models after finding out the consistency of GAN outputs.\nExtensive experiments on multiple datasets indicate that our attack method is\ngenerally much more stealthy as well as more effective in degrading FL\nperformance with low complexity. 
Our defense method is also shown to be more\ncompetent in identifying GAN-poisoned data or models. The source codes are\npublicly available at\n\\href{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}.\n","authors":["Wei Sun","Bo Gao","Ke Xiong","Yuwei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.11440v2.pdf","comment":"18 pages, 16 figures"},{"id":"http://arxiv.org/abs/2205.14797v2","updated":"2024-05-21T17:03:23Z","published":"2022-05-30T00:50:28Z","title":"Near Optimal Bounds for Replacement Paths and Related Problems in the\n CONGEST Model","summary":" We present several results in the CONGEST model on round complexity for\nReplacement Paths (RPaths), Minimum Weight Cycle (MWC), and All Nodes Shortest\nCycles (ANSC). We study these fundamental problems in both directed and\nundirected graphs, both weighted and unweighted. Many of our results are\noptimal to within a polylog factor: For an $n$-node graph $G$ we establish near\nlinear lower and upper bounds for computing RPaths if $G$ is directed and\nweighted, and for computing MWC and ANSC if $G$ is weighted, directed or\nundirected; near $\\sqrt{n}$ lower and upper bounds for undirected weighted\nRPaths; and $\\Theta(D)$ bound for undirected unweighted RPaths. We also present\nlower and upper bounds for approximation versions of these problems, notably a\n$(2-(1/g))$-approximation algorithm for undirected unweighted MWC that runs in\n$\\tilde{O}(\\sqrt{n}+D)$ rounds, improving on the previous best bound of\n$\\tilde{O}(\\sqrt{ng}+D)$ rounds, where $g$ is the MWC length. 
We present a\n$(1+\\epsilon)$-approximation algorithm for directed weighted RPaths, which\nbeats the linear lower bound for exact RPaths.\n","authors":["Vignesh Manoharan","Vijaya Ramachandran"],"pdf_url":"https://arxiv.org/pdf/2205.14797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13101v2","updated":"2024-05-21T16:59:06Z","published":"2024-03-19T19:05:24Z","title":"AdaptSFL: Adaptive Split Federated Learning in Resource-constrained Edge\n Networks","summary":" The increasing complexity of deep neural networks poses significant barriers\nto democratizing them to resource-limited edge devices. To address this\nchallenge, split federated learning (SFL) has emerged as a promising solution\nby of floading the primary training workload to a server via model partitioning\nwhile enabling parallel training among edge devices. However, although system\noptimization substantially influences the performance of SFL under\nresource-constrained systems, the problem remains largely uncharted. In this\npaper, we provide a convergence analysis of SFL which quantifies the impact of\nmodel splitting (MS) and client-side model aggregation (MA) on the learning\nperformance, serving as a theoretical foundation. Then, we propose AdaptSFL, a\nnovel resource-adaptive SFL framework, to expedite SFL under\nresource-constrained edge computing systems. Specifically, AdaptSFL adaptively\ncontrols client-side MA and MS to balance communication-computing latency and\ntraining convergence. Extensive simulations across various datasets validate\nthat our proposed AdaptSFL framework takes considerably less time to achieve a\ntarget accuracy than benchmarks, demonstrating the effectiveness of the\nproposed strategies.\n","authors":["Zheng Lin","Guanqiao Qu","Wei Wei","Xianhao Chen","Kin K. 
Leung"],"pdf_url":"https://arxiv.org/pdf/2403.13101v2.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.12894v1","updated":"2024-05-21T16:04:32Z","published":"2024-05-21T16:04:32Z","title":"Decentralized Federated Learning Over Imperfect Communication Channels","summary":" This paper analyzes the impact of imperfect communication channels on\ndecentralized federated learning (D-FL) and subsequently determines the optimal\nnumber of local aggregations per training round, adapting to the network\ntopology and imperfect channels. We start by deriving the bias of locally\naggregated D-FL models under imperfect channels from the ideal global models\nrequiring perfect channels and aggregations. The bias reveals that excessive\nlocal aggregations can accumulate communication errors and degrade convergence.\nAnother important aspect is that we analyze a convergence upper bound of D-FL\nbased on the bias. By minimizing the bound, the optimal number of local\naggregations is identified to balance a trade-off with accumulation of\ncommunication errors in the absence of knowledge of the channels. With this\nknowledge, the impact of communication errors can be alleviated, allowing the\nconvergence upper bound to decrease throughout aggregations. Experiments\nvalidate our convergence analysis and also identify the optimal number of local\naggregations on two widely considered image classification tasks. It is seen\nthat D-FL, with an optimal number of local aggregations, can outperform its\npotential alternatives by over 10% in training accuracy.\n","authors":["Weicai Li","Tiejun Lv","Wei Ni","Jingbo Zhao","Ekram Hossain","H. 
Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2405.12894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12852v1","updated":"2024-05-21T15:11:11Z","published":"2024-05-21T15:11:11Z","title":"Application Layer Cyber Deception without Developer Interaction","summary":" Cyber deception techniques that are tightly intertwined with applications\npose significant technical challenges in production systems. Security measures\nare usually the responsibility of a system operator, but they are typically\nlimited to accessing built software artifacts, not their source code. This\nlimitation makes it particularly challenging to deploy cyber deception\ntechniques at application runtime and without full control over the software\ndevelopment lifecycle. This work reviews 19 technical methods to accomplish\nthis and evaluates them based on technical, topological, operational, and\nefficacy properties. We find some novel techniques beyond honeypots and reverse\nproxies that seem to have received little research interest despite their\npromise for cyber deception. We believe that overcoming these technical\nchallenges can drive the adoption of more dynamic and personalized cyber\ndeception techniques, tailored to specific classes of applications.\n","authors":["Mario Kahlhofer","Stefan Rass"],"pdf_url":"https://arxiv.org/pdf/2405.12852v1.pdf","comment":"to be published in the 3rd Workshop on Active Defense and Deception\n (ADnD 2024)"},{"id":"http://arxiv.org/abs/2405.12678v1","updated":"2024-05-21T11:11:13Z","published":"2024-05-21T11:11:13Z","title":"Sorting in One and Two Rounds using $t$-Comparators","summary":" We examine sorting algorithms for $n$ elements whose basic operation is\ncomparing $t$ elements simultaneously (a $t$-comparator). We focus on\nalgorithms that use only a single round or two rounds -- comparisons performed\nin the second round depend on the outcomes of the first round comparators.\n We design deterministic and randomized algorithms. 
In the deterministic case,\nwe show an interesting relation to design theory (namely, to 2-Steiner\nsystems), which yields a single-round optimal algorithm for $n=t^{2^k}$ with\nany $k\\ge 1$ and a variety of possible values of $t$. For some values of $t$,\nhowever, no algorithm can reach the optimal (information-theoretic) bound on\nthe number of comparators. For this case (and any other $n$ and $t$), we show\nan algorithm that uses at most three times as many comparators as the\ntheoretical bound.\n We also design a randomized Las-Vegas two-rounds sorting algorithm for any\n$n$ and $t$. Our algorithm uses an asymptotically optimal number of\n$O(\\max(\\frac{n^{3/2}}{t^2},\\frac{n}{t}))$ comparators, with high probability,\ni.e., with probability at least $1-1/n$. The analysis of this algorithm\ninvolves the gradual unveiling of randomness, using a novel technique which we\ncoin the binary tree of deferred randomness.\n","authors":["Ran Gelles","Zvi Lotker","Frederik Mallmann-Trenn"],"pdf_url":"https://arxiv.org/pdf/2405.12678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12635v1","updated":"2024-05-21T09:39:55Z","published":"2024-05-21T09:39:55Z","title":"TempoScale: A Cloud Workloads Prediction Approach Integrating Short-Term\n and Long-Term Information","summary":" Cloud native solutions are widely applied in various fields, placing higher\ndemands on the efficient management and utilization of resource platforms. To\nachieve the efficiency, load forecasting and elastic scaling have become\ncrucial technologies for dynamically adjusting cloud resources to meet user\ndemands and minimizing resource waste. However, existing prediction-based\nmethods lack comprehensive analysis and integration of load characteristics\nacross different time scales. 
For instance, long-term trend analysis helps\nreveal long-term changes in load and resource demand, thereby supporting\nproactive resource allocation over longer periods, while short-term volatility\nanalysis can examine short-term fluctuations in load and resource demand,\nproviding support for real-time scheduling and rapid response. In response to\nthis, our research introduces TempoScale, which aims to enhance the\ncomprehensive understanding of temporal variations in cloud workloads, enabling\nmore intelligent and adaptive decision-making for elastic scaling. TempoScale\nutilizes the Complete Ensemble Empirical Mode Decomposition with Adaptive Noise\nalgorithm to decompose time-series load data into multiple Intrinsic Mode\nFunctions (IMF) and a Residual Component (RC). First, we integrate the IMF,\nwhich represents both long-term trends and short-term fluctuations, into the\ntime series prediction model to obtain intermediate results. Then, these\nintermediate results, along with the RC, are transferred into a fully connected\nlayer to obtain the final result. Finally, this result is fed into the resource\nmanagement system based on Kubernetes for resource scaling. Our proposed\napproach can reduce the Mean Square Error by 5.80% to 30.43% compared to the\nbaselines, and reduce the average response time by 5.58% to 31.15%.\n","authors":["Linfeng Wen","Minxian Xu","Adel N. Toosi","Kejiang Ye"],"pdf_url":"https://arxiv.org/pdf/2405.12635v1.pdf","comment":"11pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.12590v1","updated":"2024-05-21T08:34:39Z","published":"2024-05-21T08:34:39Z","title":"Maverick-Aware Shapley Valuation for Client Selection in Federated\n Learning","summary":" Federated Learning (FL) allows clients to train a model collaboratively\nwithout sharing their private data. One key challenge in practical FL systems\nis data heterogeneity, particularly in handling clients with rare data, also\nreferred to as Mavericks. 
These clients own one or more data classes\nexclusively, and the model performance becomes poor without their\nparticipation. Thus, utilizing Mavericks throughout training is crucial. In\nthis paper, we first design a Maverick-aware Shapley valuation that fairly\nevaluates the contribution of Mavericks. The main idea is to compute the\nclients' Shapley values (SV) class-wise, i.e., per label. Next, we propose\nFedMS, a Maverick-Shapley client selection mechanism for FL that intelligently\nselects the clients that contribute the most in each round, by employing our\nMaverick-aware SV-based contribution score. We show that, compared to an\nextensive list of baselines, FedMS achieves better model performance and fairer\nShapley Rewards distribution.\n","authors":["Mengwei Yang","Ismat Jarin","Baturalp Buyukates","Salman Avestimehr","Athina Markopoulou"],"pdf_url":"https://arxiv.org/pdf/2405.12590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12582v1","updated":"2024-05-21T08:26:38Z","published":"2024-05-21T08:26:38Z","title":"Carbon-aware Software Services","summary":" The significant carbon footprint of the ICT sector calls for methodologies to\ncontain carbon emissions of running software. This article proposes a novel\nframework for implementing, configuring and assessing carbon-aware interactive\nsoftware services. First, we propose a methodology to implement carbon-aware\nservices leveraging the Strategy design pattern to feature alternative service\nversions with different energy consumption. Then, we devise a bilevel\noptimisation scheme to configure which version to use at different times of the\nday, based on forecasts of carbon intensity and service requests, pursuing the\ntwo-fold goal of minimising carbon emissions and maintaining average output\nquality above a desired set-point. 
Last, an open-source prototype of such\noptimisation scheme is used to configure a software service implemented as per\nour methodology and assessed against traditional non-adaptive implementations\nof the same service. Results show the capability of our framework to control\nthe average quality of output results of carbon-aware services and to reduce\ncarbon emissions from 8% to 50%.\n","authors":["Stefano Forti","Jacopo Soldani","Antonio Brogi"],"pdf_url":"https://arxiv.org/pdf/2405.12582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12525v1","updated":"2024-05-21T06:30:00Z","published":"2024-05-21T06:30:00Z","title":"Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels","summary":" Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific\ncodes. Due to the heavy strain on the main memory interface from loading the\nsparse matrix and the possibly irregular memory access pattern, SpMV typically\nexhibits low arithmetic intensity. Repeating these products multiple times with\nthe same matrix is required in many algorithms. This so-called matrix power\nkernel (MPK) provides an opportunity for data reuse since the same matrix data\nis loaded from main memory multiple times, an opportunity that has only\nrecently been exploited successfully with the Recursive Algebraic Coloring\nEngine (RACE). Using RACE, one considers a graph based formulation of the SpMV\nand employs s level-based implementation of SpMV for reuse of relevant matrix\ndata. However, the underlying data dependencies have restricted the use of this\nconcept to shared memory parallelization and thus to single compute nodes.\nEnabling cache blocking for distributed-memory parallelization of MPK is\nchallenging due to the need for explicit communication and synchronization of\ndata in neighboring levels. 
In this work, we propose and implement a flexible\nmethod that interleaves the cache-blocking capabilities of RACE with an MPI\ncommunication scheme that fulfills all data dependencies among processes.\nCompared to a \"traditional\" distributed memory parallel MPK, our new\nDistributed Level-Blocked MPK yields substantial speed-ups on modern Intel and\nAMD architectures across a wide range of sparse matrices from various\nscientific applications. Finally, we address a modern quantum physics problem\nto demonstrate the applicability of our method, achieving a speed-up of up to\n4x on 832 cores of an Intel Sapphire Rapids cluster.\n","authors":["Dane C. Lacey","Christie L. Alappat","Florian Lange","Georg Hager","Holger Fehske","Gerhard Wellein"],"pdf_url":"https://arxiv.org/pdf/2405.12525v1.pdf","comment":"14 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2405.12520v1","updated":"2024-05-21T06:16:42Z","published":"2024-05-21T06:16:42Z","title":"MOSS: A Large-scale Open Microscopic Traffic Simulation System","summary":" In the research of Intelligent Transportation Systems (ITS), traffic\nsimulation is a key procedure for the evaluation of new methods and\noptimization of strategies. However, existing traffic simulation systems face\ntwo challenges. First, how to balance simulation scale with realism is a\ndilemma. Second, it is hard to simulate realistic results, which requires\nrealistic travel demand data and simulator. These problems limit computer-aided\noptimization of traffic management strategies for large-scale road networks and\nreduce the usability of traffic simulations in areas where real-world travel\ndemand data are lacking. To address these problems, we design and implement\nMObility Simulation System (MOSS). MOSS adopts GPU acceleration to\nsignificantly improve the efficiency and scale of microscopic traffic\nsimulation, which enables realistic and fast simulations for large-scale road\nnetworks. 
It provides realistic travel Origin-Destination (OD) matrices\ngeneration through a pre-trained generative neural network model based on\npublicly available data on a global scale, such as satellite imagery, to help\nresearchers build meaningful travel demand data. It also provides a complete\nopen toolchain to help users with road network construction, demand generation,\nsimulation, and result analysis. The whole toolchain including the simulator\ncan be accessed at https://moss.fiblab.net and the codes are open-source for\ncommunity collaboration.\n","authors":["Jun Zhang","Wenxuan Ao","Junbo Yan","Can Rong","Depeng Jin","Wei Wu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2405.12520v1.pdf","comment":"Submitted to IEEE ITSC 2024"},{"id":"http://arxiv.org/abs/2402.08950v3","updated":"2024-05-21T04:44:31Z","published":"2024-02-14T05:35:03Z","title":"Taking GPU Programming Models to Task for Performance Portability","summary":" Portability is critical to ensuring high productivity in developing and\nmaintaining scientific software as the diversity in on-node hardware\narchitectures increases. While several programming models provide portability\nfor diverse GPU platforms, they don't make any guarantees about performance\nportability. In this work, we explore several programming models -- CUDA, HIP,\nKokkos, RAJA, OpenMP, OpenACC, and SYCL, to study if the performance of these\nmodels is consistently good across NVIDIA and AMD GPUs. We use five proxy\napplications from different scientific domains, create implementations where\nmissing, and use them to present a comprehensive comparative evaluation of the\nprogramming models. We provide a Spack scripting-based methodology to ensure\nreproducibility of experiments conducted in this work. Finally, we attempt to\nanswer the question -- to what extent does each programming model provide\nperformance portability for heterogeneous systems in real-world usage?\n","authors":["Joshua H. 
Davis","Pranav Sivaraman","Joy Kitson","Konstantinos Parasyris","Harshitha Menon","Isaac Minn","Giorgis Georgakoudis","Abhinav Bhatele"],"pdf_url":"https://arxiv.org/pdf/2402.08950v3.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2101.09337v5","updated":"2024-05-21T04:29:16Z","published":"2021-01-22T21:14:25Z","title":"Approximate Byzantine Fault-Tolerance in Distributed Optimization","summary":" This paper considers the problem of Byzantine fault-tolerance in distributed\nmulti-agent optimization. In this problem, each agent has a local cost\nfunction, and in the fault-free case, the goal is to design a distributed\nalgorithm that allows all the agents to find a minimum point of all the agents'\naggregate cost function. We consider a scenario where some agents might be\nByzantine faulty that renders the original goal of computing a minimum point of\nall the agents' aggregate cost vacuous. A more reasonable objective for an\nalgorithm in this scenario is to allow all the non-faulty agents to compute the\nminimum point of only the non-faulty agents' aggregate cost. Prior work shows\nthat if there are up to $f$ (out of $n$) Byzantine agents then a minimum point\nof the non-faulty agents' aggregate cost can be computed exactly if and only if\nthe non-faulty agents' costs satisfy a certain redundancy property called\n$2f$-redundancy. However, $2f$-redundancy is an ideal property that can be\nsatisfied only in systems free from noise or uncertainties, which can make the\ngoal of exact fault-tolerance unachievable in some applications. Thus, we\nintroduce the notion of $(f,\\epsilon)$-resilience, a generalization of exact\nfault-tolerance wherein the objective is to find an approximate minimum point\nof the non-faulty aggregate cost, with $\\epsilon$ accuracy. This approximate\nfault-tolerance can be achieved under a weaker condition that is easier to\nsatisfy in practice, compared to $2f$-redundancy. 
We obtain necessary and\nsufficient conditions for achieving $(f,\\epsilon)$-resilience characterizing\nthe correlation between relaxation in redundancy and approximation in\nresilience. In case when the agents' cost functions are differentiable, we\nobtain conditions for $(f,\\epsilon)$-resilience of the distributed\ngradient-descent method when equipped with robust gradient aggregation.\n","authors":["Shuo Liu","Nirupam Gupta","Nitin H. Vaidya"],"pdf_url":"https://arxiv.org/pdf/2101.09337v5.pdf","comment":"43 pages, 5 figures, and 1 table. The report is an important\n extension to prior work https://dl.acm.org/doi/abs/10.1145/3382734.3405748,\n and arXiv:2003.09675; Added an alternative result with a better analysis"},{"id":"http://arxiv.org/abs/2405.12431v1","updated":"2024-05-21T01:14:33Z","published":"2024-05-21T01:14:33Z","title":"Data Sharing at the Edge of the Network: A Disturbance Resilient\n Multi-modal ITS","summary":" Mobility-as-a-Service (MaaS) is a paradigm that encourages the shift from\nprivate cars to more sustainable alternative mobility services. MaaS provides\nservices that enhances and enables multiple modes of transport to operate\nseamlessly and bringing Multimodal Intelligent Transport Systems (M-ITS) closer\nto reality. This requires sharing and integration of data collected from\nmultiple sources including modes of transports, sensors, and end-users' devices\nto allow a seamless and integrated services especially during unprecedented\ndisturbances. This paper discusses the interactions among transportation modes,\nnetworks, potential disturbance scenarios, and adaptation strategies to\nmitigate their impact on MaaS. We particularly discuss the need to share data\nbetween the modes of transport and relevant entities that are at the vicinity\nof each other, taking advantage of edge computing technology to avoid any\nlatency due to communication to the cloud and privacy concerns. 
However, when\nsharing at the edge, bandwidth, storage, and computational limitations must be\nconsidered.\n","authors":["Igor Mikolasek","Saeedeh Ghanadbashi","Nima Afraz","Fatemeh Golpayegani"],"pdf_url":"https://arxiv.org/pdf/2405.12431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16314v2","updated":"2024-05-21T22:39:13Z","published":"2024-04-25T03:44:45Z","title":"Parallel and (Nearly) Work-Efficient Dynamic Programming","summary":" The idea of dynamic programming (DP), proposed by Bellman in the 1950s, is\none of the most important algorithmic techniques. However, in parallel, many\nfundamental and sequentially simple problems become more challenging, and open\nto a (nearly) work-efficient solution (i.e., the work is off by at most a\npolylogarithmic factor over the best sequential solution). In fact, sequential\nDP algorithms employ many advanced optimizations such as decision monotonicity\nor special data structures, and achieve better work than straightforward\nsolutions. Many such optimizations are inherently sequential, which creates\nextra challenges for a parallel algorithm to achieve the same work bound.\n The goal of this paper is to achieve (nearly) work-efficient parallel DP\nalgorithms by parallelizing classic, highly-optimized and practical sequential\nalgorithms. We show a general framework called the Cordon Algorithm for\nparallel DP algorithms, and use it to solve several classic problems. Our\nselection of problems includes Longest Increasing Subsequence (LIS), sparse\nLongest Common Subsequence (LCS), convex/concave generalized Least Weight\nSubsequence (LWS), Optimal Alphabetic Tree (OAT), and more. We show how the\nCordon Algorithm can be used to achieve the same level of optimization as the\nsequential algorithms, and achieve good parallelism. 
Many of our algorithms are\nconceptually simple, and we show some experimental results as\nproofs-of-concept.\n","authors":["Xiangyun Ding","Yan Gu","Yihan Sun"],"pdf_url":"https://arxiv.org/pdf/2404.16314v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02576v2","updated":"2024-05-21T20:16:41Z","published":"2024-02-04T18:05:02Z","title":"Exploring the Design Space for Message-Driven Systems for Dynamic Graph\n Processing using CCA","summary":" Computer systems that have been successfully deployed for dense regular\nworkloads fall short of achieving scalability and efficiency when applied to\nirregular and dynamic graph applications. Conventional computing systems rely\nheavily on static, regular, numeric intensive computations while High\nPerformance Computing systems executing parallel graph applications exhibit\nlittle locality, spatial or temporal, and are fine-grained and memory\nintensive. With the strong interest in AI which depend on these very different\nuse cases combined with the end of Moore's Law at nanoscale, dramatic\nalternatives in architecture and underlying execution models are required. This\npaper identifies an innovative non-von Neumann architecture, Continuum Computer\nArchitecture (CCA), that redefines the nature of computing structures to yield\npowerful innovations in computational methods to deliver a new generation of\nhighly parallel hardware architecture. CCA reflects a genus of highly parallel\narchitectures that while varying in specific quantities (e.g., memory blocks),\nshare a multiple of attributes not found in typical von Neumann machines. Among\nthese are memory-centric components, message-driven asynchronous flow control,\nand lightweight out-of-order execution across a global name space. Together\nthese innovative non-von Neumann architectural properties guided by a new\noriginal execution model will deliver the new future path for extending beyond\nthe von Neumann model. 
This paper documents a series of interrelated\nexperiments that together establish future directions for next generation\nnon-von Neumann architectures, especially for graph processing.\n","authors":["Bibrak Qamar Chandio","Maciej Brodowicz","Thomas Sterling"],"pdf_url":"https://arxiv.org/pdf/2402.02576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13090v1","updated":"2024-05-21T11:44:07Z","published":"2024-05-21T11:44:07Z","title":"FedASTA: Federated adaptive spatial-temporal attention for traffic flow\n prediction","summary":" Mobile devices and the Internet of Things (IoT) devices nowadays generate a\nlarge amount of heterogeneous spatial-temporal data. It remains a challenging\nproblem to model the spatial-temporal dynamics under privacy concern. Federated\nlearning (FL) has been proposed as a framework to enable model training across\ndistributed devices without sharing original data which reduce privacy concern.\nPersonalized federated learning (PFL) methods further address data heterogenous\nproblem. However, these methods don't consider natural spatial relations among\nnodes. For the sake of modeling spatial relations, Graph Neural Netowork (GNN)\nbased FL approach have been proposed. But dynamic spatial-temporal relations\namong edge nodes are not taken into account. Several approaches model\nspatial-temporal dynamics in a centralized environment, while less effort has\nbeen made under federated setting. To overcome these challeges, we propose a\nnovel Federated Adaptive Spatial-Temporal Attention (FedASTA) framework to\nmodel the dynamic spatial-temporal relations. On the client node, FedASTA\nextracts temporal relations and trend patterns from the decomposed terms of\noriginal time series. Then, on the server node, FedASTA utilize trend patterns\nfrom clients to construct adaptive temporal-spatial aware graph which captures\ndynamic correlation between clients. 
Besides, we design a masked spatial\nattention module with both static graph and constructed adaptive graph to model\nspatial dependencies among clients. Extensive experiments on five real-world\npublic traffic flow datasets demonstrate that our method achieves state-of-art\nperformance in federated scenario. In addition, the experiments made in\ncentralized setting show the effectiveness of our novel adaptive graph\nconstruction approach compared with other popular dynamic spatial-temporal\naware methods.\n","authors":["Kaiyuan Li","Yihan Zhang","Xinlei Chen"],"pdf_url":"https://arxiv.org/pdf/2405.13090v1.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2405.12034v2","updated":"2024-05-21T07:52:37Z","published":"2024-05-20T14:01:38Z","title":"Count-Min Sketch with Conservative Updates: Worst-Case Analysis","summary":" Count-Min Sketch with Conservative Updates (CMS-CU) is a memory-efficient\nhash-based data structure used to estimate the occurrences of items within a\ndata stream. CMS-CU stores $m$ counters and employs $d$ hash functions to map\nitems to these counters. We first argue that the estimation error in CMS-CU is\nmaximal when each item appears at most once in the stream. Next, we study\nCMS-CU in this setting. In the case where $d=m-1$, we prove that the average\nestimation error and the average counter rate converge almost surely to\n$\\frac{1}{2}$, contrasting with the vanilla Count-Min Sketch, where the average\ncounter rate is equal to $\\frac{m-1}{m}$. For any given $m$ and $d$, we prove\nnovel lower and upper bounds on the average estimation error, incorporating a\npositive integer parameter $g$. Larger values of this parameter improve the\naccuracy of the bounds. Moreover, the computation of each bound involves\nexamining an ergodic Markov process with a state space of size\n$\\binom{m+g-d}{g}$ and a sparse transition probabilities matrix containing\n$\\mathcal{O}(m\\binom{m+g-d}{g})$ non-zero entries. 
For $d=m-1$, $g=1$, and as\n$m\\to \\infty$, we show that the lower and upper bounds coincide. In general,\nour bounds exhibit high accuracy for small values of $g$, as shown by numerical\ncomputation. For example, for $m=50$, $d=4$, and $g=5$, the difference between\nthe lower and upper bounds is smaller than $10^{-4}$.\n","authors":["Younes Ben Mazziane","Othmane Marfoq"],"pdf_url":"https://arxiv.org/pdf/2405.12034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08950v3","updated":"2024-05-21T04:44:31Z","published":"2024-02-14T05:35:03Z","title":"Taking GPU Programming Models to Task for Performance Portability","summary":" Portability is critical to ensuring high productivity in developing and\nmaintaining scientific software as the diversity in on-node hardware\narchitectures increases. While several programming models provide portability\nfor diverse GPU platforms, they don't make any guarantees about performance\nportability. In this work, we explore several programming models -- CUDA, HIP,\nKokkos, RAJA, OpenMP, OpenACC, and SYCL, to study if the performance of these\nmodels is consistently good across NVIDIA and AMD GPUs. We use five proxy\napplications from different scientific domains, create implementations where\nmissing, and use them to present a comprehensive comparative evaluation of the\nprogramming models. We provide a Spack scripting-based methodology to ensure\nreproducibility of experiments conducted in this work. Finally, we attempt to\nanswer the question -- to what extent does each programming model provide\nperformance portability for heterogeneous systems in real-world usage?\n","authors":["Joshua H. 
Davis","Pranav Sivaraman","Joy Kitson","Konstantinos Parasyris","Harshitha Menon","Isaac Minn","Giorgis Georgakoudis","Abhinav Bhatele"],"pdf_url":"https://arxiv.org/pdf/2402.08950v3.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.12525v1","updated":"2024-05-21T06:30:00Z","published":"2024-05-21T06:30:00Z","title":"Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels","summary":" Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific\ncodes. Due to the heavy strain on the main memory interface from loading the\nsparse matrix and the possibly irregular memory access pattern, SpMV typically\nexhibits low arithmetic intensity. Repeating these products multiple times with\nthe same matrix is required in many algorithms. This so-called matrix power\nkernel (MPK) provides an opportunity for data reuse since the same matrix data\nis loaded from main memory multiple times, an opportunity that has only\nrecently been exploited successfully with the Recursive Algebraic Coloring\nEngine (RACE). Using RACE, one considers a graph based formulation of the SpMV\nand employs s level-based implementation of SpMV for reuse of relevant matrix\ndata. However, the underlying data dependencies have restricted the use of this\nconcept to shared memory parallelization and thus to single compute nodes.\nEnabling cache blocking for distributed-memory parallelization of MPK is\nchallenging due to the need for explicit communication and synchronization of\ndata in neighboring levels. In this work, we propose and implement a flexible\nmethod that interleaves the cache-blocking capabilities of RACE with an MPI\ncommunication scheme that fulfills all data dependencies among processes.\nCompared to a \"traditional\" distributed memory parallel MPK, our new\nDistributed Level-Blocked MPK yields substantial speed-ups on modern Intel and\nAMD architectures across a wide range of sparse matrices from various\nscientific applications. 
Finally, we address a modern quantum physics problem\nto demonstrate the applicability of our method, achieving a speed-up of up to\n4x on 832 cores of an Intel Sapphire Rapids cluster.\n","authors":["Dane C. Lacey","Christie L. Alappat","Florian Lange","Georg Hager","Holger Fehske","Gerhard Wellein"],"pdf_url":"https://arxiv.org/pdf/2405.12525v1.pdf","comment":"14 pages, 12 figures, 5 tables"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2405.07724v2","updated":"2024-05-21T14:27:17Z","published":"2024-05-13T13:19:50Z","title":"Monoidal closure of Grothendieck constructions via $Σ$-tractable\n monoidal structures and Dialectica formulas","summary":" We study the categorical structure of the Grothendieck construction of an\nindexed category $\\mathcal{L}:\\mathcal{C}^{op}\\to\\mathbf{CAT}$ and characterise\nfibred limits, colimits, and monoidal structures. Next, we give sufficient\nconditions for the monoidal closure of the total category $\\Sigma_\\mathcal{C}\n\\mathcal{L}$ of a Grothendieck construction of an indexed category\n$\\mathcal{L}:\\mathcal{C}^{op}\\to\\mathbf{CAT}$. Our analysis is a generalization\nof G\\\"odel's Dialectica interpretation, and it relies on a novel notion of\n$\\Sigma$-tractable monoidal structure. As we will see, $\\Sigma$-tractable\ncoproducts simultaneously generalize cocartesian coclosed structures,\nbiproducts and extensive coproducts. We analyse when the closed structure is\nfibred -- usually it is not.\n","authors":["Fernando Lucatelli Nunes","Matthijs Vákár"],"pdf_url":"https://arxiv.org/pdf/2405.07724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12976v1","updated":"2024-05-21T17:57:33Z","published":"2024-05-21T17:57:33Z","title":"A Sound Type System for Secure Currency Flow","summary":" In this paper we focus on TinySol, a minimal calculus for Solidity smart\ncontracts, introduced by Bartoletti et al. 
We start by rephrasing its syntax\n(to emphasise its object-oriented flavour) and give a new big-step operational\nsemantics. We then use it to define two security properties, namely call\nintegrity and noninterference. These two properties have some similarities in\ntheir definition, in that they both require that some part of a program is not\ninfluenced by the other part. However, we show that the two properties are\nactually incomparable. Nevertheless, we provide a type system for\nnoninterference and show that well-typed programs satisfy call integrity as\nwell; hence, programs that are accepted by our type system satisfy both\nproperties. We finally discuss the practical usability of the type system and\nits limitations by means of some simple examples.\n","authors":["Luca Aceto","Daniele Gorla","Stian Lybech"],"pdf_url":"https://arxiv.org/pdf/2405.12976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12841v1","updated":"2024-05-21T14:46:55Z","published":"2024-05-21T14:46:55Z","title":"Unveiling the Power of Intermediate Representations for Static Analysis:\n A Survey","summary":" Static analysis techniques enhance the security, performance, and reliability\nof programs by analyzing and portraiting program behaviors without the need for\nactual execution. In essence, static analysis takes the Intermediate\nRepresentation (IR) of a target program as input to retrieve essential program\ninformation and understand the program. However, there is a lack of systematic\nanalysis on the benefit of IR for static analysis, besides serving as an\ninformation provider. In general, a modern static analysis framework should\npossess the ability to conduct diverse analyses on different languages,\nproducing reliable results with minimal time consumption, and offering\nextensive customization options. In this survey, we systematically characterize\nthese goals and review the potential solutions from the perspective of IR. 
It\ncan serve as a manual for learners and practitioners in the static analysis\nfield to better understand IR design. Meanwhile, numerous research\nopportunities are revealed for researchers.\n","authors":["Bowen Zhang","Wei Chen","Hung-Chun Chiu","Charles Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.12841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12699v1","updated":"2024-05-21T11:46:51Z","published":"2024-05-21T11:46:51Z","title":"GeckoGraph: A Visual Language for Polymorphic Types","summary":" Polymorphic types are an important feature in most strongly typed programming\nlanguages. They allow functions to be written in a way that can be used with\ndifferent data types, while still enforcing the relationship and constraints\nbetween the values. However, programmers often find polymorphic types difficult\nto use and understand and tend to reason using concrete types. We propose\nGeckoGraph, a graphical notation for types. GeckoGraph aims to accompany\ntraditional text-based type notation and to make reading, understanding, and\ncomparing types easier. We conducted a large-scale human study using GeckoGraph\ncompared to text-based type notation. To our knowledge, this is the largest\ncontrolled user study on functional programming ever conducted. The results of\nthe study show that GeckoGraph helps improve programmers' ability to succeed in\nthe programming tasks we designed, especially for novice programmers.\n","authors":["Shuai Fu","Tim Dwyer","Peter J. Stuckey"],"pdf_url":"https://arxiv.org/pdf/2405.12699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12697v1","updated":"2024-05-21T11:40:39Z","published":"2024-05-21T11:40:39Z","title":"Goanna: Resolving Haskell Type Errors With Minimal Correction Subsets","summary":" Statically typed languages offer significant advantages, such as bug\nprevention, enhanced code quality, and reduced maintenance costs. 
However,\nthese benefits often come at the expense of a steep learning curve and a slower\ndevelopment pace. Haskell, known for its expressive and strict type system,\nposes challenges for inexperienced programmers in learning and using its type\nsystem, especially in debugging type errors. We introduce Goanna, a novel tool\nthat serves as a type checker and an interactive type error debugging tool for\nHaskell. When encountering type errors, Goanna identifies a comprehensive list\nof potential causes and resolutions based on the minimum correction subsets\n(MCS) enumeration. We evaluated Goanna's effectiveness using 86 diverse Haskell\nprograms from online discourse, demonstrating its ability to accurately\nidentify and resolve type errors. Additionally, we present a collection of\ntechniques and heuristics to enhance Goanna's suggestion-based error diagnosis\nand show their effectiveness from our evaluation.\n","authors":["Shuai Fu","Tim Dwyer","Peter J. Stuckey","John Grundy"],"pdf_url":"https://arxiv.org/pdf/2405.12697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12586v1","updated":"2024-05-21T08:31:09Z","published":"2024-05-21T08:31:09Z","title":"Reduction Strategies in the Lambda Calculus and Their Implementation\n through Derivable Abstract Machines: Introduction","summary":" The lambda calculus since more than half a century is a model and foundation\nof functional programming languages. 
However, lambda expressions can be\nevaluated with different reduction strategies and thus, there is no fixed cost\nmodel nor one canonical implementation for all applications of the lambda\ncalculus.\n This article is an introduction to a dissertation is composed of four\nconference papers where: we present a systematic survey of reduction strategies\nof the lambda calculus; we take advantage of the functional correspondence as a\ntool for studying implementations of the lambda calculus by deriving an\nabstract machine for a precisely identified strong call-by-value reduction\nstrategy; we improve it to obtain an efficient abstract machine for strong call\nby value and provide a time complexity analysis for the new machine with the\nuse of a potential function; and we present the first provably efficient\nabstract machine for strong call by need.\n","authors":["Tomasz Drab"],"pdf_url":"https://arxiv.org/pdf/2405.12586v1.pdf","comment":"37 pages, 12 figures, 2 tables, 4 code listings"},{"id":"http://arxiv.org/abs/2405.12513v1","updated":"2024-05-21T05:54:27Z","published":"2024-05-21T05:54:27Z","title":"Fully Randomized Pointers","summary":" Software security continues to be a critical concern for programs implemented\nin low-level programming languages such as C and C++. Many defenses have been\nproposed in the current literature, each with different trade-offs including\nperformance, compatibility, and attack resistance. One general class of defense\nis pointer randomization or authentication, where invalid object access (e.g.,\nmemory errors) is obfuscated or denied. Many defenses rely on the program\ntermination (e.g., crashing) to abort attacks, with the implicit assumption\nthat an adversary cannot \"brute force\" the defense with multiple attack\nattempts. However, such assumptions do not always hold, such as hardware\nspeculative execution attacks or network servers configured to restart on\nerror. 
In such cases, we argue that most existing defenses provide only weak\neffective security.\n In this paper, we propose Fully Randomized Pointers (FRP) as a stronger\nmemory error defense that is resistant to even brute force attacks. The key\nidea is to fully randomize pointer bits -- as much as possible while also\npreserving binary compatibility -- rendering the relationships between pointers\nhighly unpredictable. Furthermore, the very high degree of randomization\nrenders brute force attacks impractical -- providing strong effective security\ncompared to existing work. We design a new FRP encoding that is: (1) compatible\nwith existing binary code (without recompilation); (2) decoupled from the\nunderlying object layout; and (3) can be efficiently decoded on-the-fly to the\nunderlying memory address. We prototype FRP in the form of a software\nimplementation (BlueFat) to test security and compatibility, and a\nproof-of-concept hardware implementation (GreenFat) to evaluate performance. We\nshow that FRP is secure, practical, and compatible at the binary level, while a\nhardware implementation can achieve low performance overheads (<10%).\n","authors":["Gregory J. Duck","Sai Dhawal Phaye","Roland H. C. Yap","Trevor E. Carlson"],"pdf_url":"https://arxiv.org/pdf/2405.12513v1.pdf","comment":"24 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.12507v1","updated":"2024-05-21T05:34:34Z","published":"2024-05-21T05:34:34Z","title":"Compiler support for semi-manual AoS-to-SoA conversions with data views","summary":" The C programming language and its cousins such as C++ stipulate the static\nstorage of sets of structured data: Developers have to commit to one, invariant\ndata model -- typically a structure-of-arrays (SoA) or an array-of-structs\n(AoS) -- unless they manually rearrange, i.e.~convert it throughout the\ncomputation. Whether AoS or SoA is favourable depends on the execution context\nand algorithm step. 
We propose a language extension based upon C++ attributes\nthrough which developers can guide the compiler what memory arrangements are to\nbe used. The compiler can then automatically convert (parts of) the data into\nthe format of choice prior to a calculation and convert results back\nafterwards. As all conversions are merely annotations, it is straightforward\nfor the developer to experiment with different storage formats and to pick\nsubsets of data that are subject to memory rearrangements. Our work implements\nthe annotations within Clang and demonstrates their potential impact through a\nsmoothed particle hydrodynamics (SPH) code.\n","authors":["Pawel K. Radtke","Tobias Weinzierl"],"pdf_url":"https://arxiv.org/pdf/2405.12507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06907v2","updated":"2024-05-21T20:35:55Z","published":"2024-05-11T04:29:03Z","title":"AIOS Compiler: LLM as Interpreter for Natural Language Programming and\n Flow Programming of AI Agents","summary":" Since their inception, programming languages have trended towards greater\nreadability and lower barriers for programmers. Following this trend, natural\nlanguage can be a promising type of programming language that provides great\nflexibility and usability and helps towards the democracy of programming.\nHowever, the inherent vagueness, ambiguity, and verbosity of natural language\npose significant challenges in developing an interpreter that can accurately\nunderstand the programming logic and execute instructions written in natural\nlanguage. Fortunately, recent advancements in Large Language Models (LLMs) have\ndemonstrated remarkable proficiency in interpreting complex natural language.\nInspired by this, we develop a novel system for Code Representation and\nExecution (CoRE), which employs LLM as interpreter to interpret and execute\nnatural language instructions. 
The proposed system unifies natural language\nprogramming, pseudo-code programming, and flow programming under the same\nrepresentation for constructing language agents, while LLM serves as the\ninterpreter to interpret and execute the agent programs. In this paper, we\nbegin with defining the programming syntax that structures natural language\ninstructions logically. During the execution, we incorporate external memory to\nminimize redundancy. Furthermore, we equip the designed interpreter with the\ncapability to invoke external tools, compensating for the limitations of LLM in\nspecialized domains or when accessing real-time information. This work is\nopen-source at https://github.com/agiresearch/CoRE,\nhttps://github.com/agiresearch/OpenAGI, and\nhttps://github.com/agiresearch/AIOS.\n","authors":["Shuyuan Xu","Zelong Li","Kai Mei","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.06907v2.pdf","comment":"12 pages, 6 figures, comments and suggestions are welcome"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2405.11155v2","updated":"2024-05-21T13:14:06Z","published":"2024-05-18T02:59:32Z","title":"Inner-approximate Reachability Computation via Zonotopic Boundary\n Analysis","summary":" Inner-approximate reachability analysis involves calculating subsets of\nreachable sets, known as inner-approximations. This analysis is crucial in the\nfields of dynamic systems analysis and control theory as it provides a reliable\nestimation of the set of states that a system can reach from given initial\nstates at a specific time instant. In this paper, we study the\ninner-approximate reachability analysis problem based on the set-boundary\nreachability method for systems modelled by ordinary differential equations, in\nwhich the computed inner-approximations are represented with zonotopes. The\nset-boundary reachability method computes an inner-approximation by excluding\nstates reached from the initial set's boundary. 
The effectiveness of this\nmethod is highly dependent on the efficient extraction of the exact boundary of\nthe initial set. To address this, we propose methods leveraging boundary and\ntiling matrices that can efficiently extract and refine the exact boundary of\nthe initial set represented by zonotopes. Additionally, we enhance the\nexclusion strategy by contracting the outer-approximations in a flexible way,\nwhich allows for the computation of less conservative inner-approximations. To\nevaluate the proposed method, we compare it with state-of-the-art methods\nagainst a series of benchmarks. The numerical results demonstrate that our\nmethod is not only efficient but also accurate in computing\ninner-approximations.\n","authors":["Dejin Ren","Zhen Liang","Chenyu Wu","Jianqiang Ding","Taoran Wu","Bai Xue"],"pdf_url":"https://arxiv.org/pdf/2405.11155v2.pdf","comment":"the extended version of the paper accepted by CAV 2024"},{"id":"http://arxiv.org/abs/2405.12583v1","updated":"2024-05-21T08:27:21Z","published":"2024-05-21T08:27:21Z","title":"Ergodic Unobservable MDPs: Decidability of Approximation","summary":" Unobservable Markov decision processes (UMDPs) serve as a prominent\nmathematical framework for modeling sequential decision-making problems. A key\naspect in computational analysis is the consideration of decidability, which\nconcerns the existence of algorithms. In general, the computation of the exact\nand approximated values is undecidable for UMDPs with the long-run average\nobjective. Building on matrix product theory and ergodic properties, we\nintroduce a novel subclass of UMDPs, termed ergodic UMDPs. Our main result\ndemonstrates that approximating the value within this subclass is decidable.\nHowever, we show that the exact problem remains undecidable. 
Finally, we\ndiscuss the primary challenges of extending these results to partially\nobservable Markov decision processes.\n","authors":["Krishnendu Chatterjee","David Lurie","Raimundo Saona","Bruno Ziliotto"],"pdf_url":"https://arxiv.org/pdf/2405.12583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12555v1","updated":"2024-05-21T07:50:38Z","published":"2024-05-21T07:50:38Z","title":"A Subexponential Reduction from Product Partition to Subset Sum","summary":" In this paper we study the Product Partition Problem (PPP), i.e. we are given\na set of $n$ natural numbers represented on $m$ bits each and we are asked if a\nsubset exists such that the product of the numbers in the subset equals the\nproduct of the numbers not in the subset. Our approach is to obtain the integer\nfactorization of each number. This is the subexponential step. We then form a\nmatrix with the exponents of the primes and show that the PPP has a solution\niff some Subset Sum Problems have a common solution. Finally, using the fact\nthat the exponents are not large we combine all the Subset Sum Problems in a\nsingle Subset Sum Problem (SSP) and show that its size is polynomial in $m,n$.\nWe show that the PPP has a solution iff the final SSP has one.\n","authors":["Marius Costandin"],"pdf_url":"https://arxiv.org/pdf/2405.12555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07683v2","updated":"2024-05-21T06:13:20Z","published":"2024-02-12T14:43:40Z","title":"Two Choices are Enough for P-LCPs, USOs, and Colorful Tangents","summary":" We provide polynomial-time reductions between three search problems from\nthree distinct areas: the P-matrix linear complementarity problem (P-LCP),\nfinding the sink of a unique sink orientation (USO), and a variant of the\n$\\alpha$-Ham Sandwich problem. For all three settings, we show that \"two\nchoices are enough\", meaning that the general non-binary version of the problem\ncan be reduced in polynomial time to the binary version. 
This specifically\nmeans that generalized P-LCPs are equivalent to P-LCPs, and grid USOs are\nequivalent to cube USOs. These results are obtained by showing that both the\nP-LCP and our $\\alpha$-Ham Sandwich variant are equivalent to a new problem we\nintroduce, P-Lin-Bellman. This problem can be seen as a new tool for\nformulating problems as P-LCPs.\n","authors":["Michaela Borzechowski","John Fearnley","Spencer Gordon","Rahul Savani","Patrick Schnider","Simon Weber"],"pdf_url":"https://arxiv.org/pdf/2402.07683v2.pdf","comment":"29 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.16344v2","updated":"2024-05-21T00:15:20Z","published":"2023-10-25T03:59:08Z","title":"Baby PIH: Parameterized Inapproximability of Min CSP","summary":" The Parameterized Inapproximability Hypothesis (PIH) is the analog of the PCP\ntheorem in the world of parameterized complexity. It asserts that no FPT\nalgorithm can distinguish a satisfiable 2CSP instance from one which is only\n$(1-\\varepsilon)$-satisfiable (where the parameter is the number of variables)\nfor some constant $0<\\varepsilon<1$.\n We consider a minimization version of CSPs (Min-CSP), where one may assign\n$r$ values to each variable, and the goal is to ensure that every constraint is\nsatisfied by some choice among the $r \\times r$ pairs of values assigned to its\nvariables (call such a CSP instance $r$-list-satisfiable). We prove the\nfollowing strong parameterized inapproximability for Min CSP: For every $r \\ge\n1$, it is W[1]-hard to tell if a 2CSP instance is satisfiable or is not even\n$r$-list-satisfiable. We refer to this statement as \"Baby PIH\", following the\nrecently proved Baby PCP Theorem (Barto and Kozik, 2021). Our proof adapts the\ncombinatorial arguments underlying the Baby PCP theorem, overcoming some basic\nobstacles that arise in the parameterized setting. 
Furthermore, our reduction\nruns in time polynomially bounded in both the number of variables and the\nalphabet size, and thus implies the Baby PCP theorem as well.\n","authors":["Venkatesan Guruswami","Xuandi Ren","Sai Sandeep"],"pdf_url":"https://arxiv.org/pdf/2310.16344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17219v4","updated":"2024-05-21T00:09:25Z","published":"2024-01-30T18:05:11Z","title":"A criterion for Andrásfai--Erdős--Sós type theorems and\n applications","summary":" The classical Andr\\'{a}sfai--Erd\\H{o}s--S\\'{o}s Theorem states that for\n$\\ell\\ge 2$, every $n$-vertex $K_{\\ell+1}$-free graph with minimum degree\ngreater than $\\frac{3\\ell-4}{3\\ell-1}n$ must be $\\ell$-partite. We establish a\nsimple criterion for $r$-graphs, $r \\geq 2$, to exhibit an\nAndr\\'{a}sfai--Erd\\H{o}s--S\\'{o}s type property, also known as\ndegree-stability. This leads to a classification of most previously studied\nhypergraph families with this property. An immediate application of this\nresult, combined with a general theorem by Keevash--Lenz--Mubayi, solves the\nspectral Tur\\'{a}n problems for a large class of hypergraphs.\n For every $r$-graph $F$ with degree-stability, there is a simple algorithm to\ndecide the $F$-freeness of an $n$-vertex $r$-graph with minimum degree greater\nthan $(\\pi(F) - \\varepsilon_F)\\binom{n}{r-1}$ in time $O(n^r)$, where\n$\\varepsilon_F >0$ is a constant. In particular, for the complete graph\n$K_{\\ell+1}$, we can take $\\varepsilon_{K_{\\ell+1}} = (3\\ell^2-\\ell)^{-1}$, and\nthis bound is tight up to some multiplicative constant factor unless\n$\\mathbf{W[1]} = \\mathbf{FPT}$. Based on a result by Chen--Huang--Kanj--Xia, we\nfurther show that for every fixed $C > 0$, this problem cannot be solved in\ntime $n^{o(\\ell)}$ if we replace $\\varepsilon_{K_{\\ell+1}}$ with $(C\\ell)^{-1}$\nunless $\\mathbf{ETH}$ fails. 
Furthermore, we apply the degree-stability of\n$K_{\\ell+1}$ to decide the $K_{\\ell+1}$-freeness of graphs whose size is close\nto the Tur\\'{a}n bound in time $(\\ell+1)n^2$, partially improving a recent\nresult by Fomin--Golovach--Sagunov--Simonov. As an intermediate step, we show\nthat for a specific class of $r$-graphs $F$, the (surjective) $F$-coloring\nproblem can be solved in time $O(n^r)$, provided the input $r$-graph has $n$\nvertices and a large minimum degree, refining several previous results.\n","authors":["Jianfeng Hou","Xizhi Liu","Hongbin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.17219v4.pdf","comment":"fixed some typos, changed the title, reorganized to enhance\n readability for combinatorial readers, comments are welcome"},{"id":"http://arxiv.org/abs/2405.13143v1","updated":"2024-05-21T18:26:42Z","published":"2024-05-21T18:26:42Z","title":"Pseudorandomness, symmetry, smoothing: I","summary":" We prove several new results about bounded uniform and small-bias\ndistributions. A main message is that, small-bias, even perturbed with noise,\ndoes not fool several classes of tests better than bounded uniformity. We prove\nthis for threshold tests, small-space algorithms, and small-depth circuits. In\nparticular, we obtain small-bias distributions that\n 1) achieve an optimal lower bound on their statistical distance to any\nbounded-uniform distribution. This closes a line of research initiated by Alon,\nGoldreich, and Mansour in 2003, and improves on a result by O'Donnell and Zhao.\n 2) have heavier tail mass than the uniform distribution. This answers a\nquestion posed by several researchers including Bun and Steinke.\n 3) rule out a popular paradigm for constructing pseudorandom generators,\noriginating in a 1989 work by Ajtai and Wigderson. This again answers a\nquestion raised by several researchers. For branching programs, our result\nmatches a bound by Forbes and Kelley.\n Our small-bias distributions above are symmetric. 
We show that the xor of any\ntwo symmetric small-bias distributions fools any bounded function. Hence our\nexamples cannot be extended to the xor of two small-bias distributions, another\npopular paradigm whose power remains unknown. We also generalize and simplify\nthe proof of a result of Bazzi.\n","authors":["Harm Derksen","Peter Ivanov","Chin Ho Lee","Emanuele Viola"],"pdf_url":"https://arxiv.org/pdf/2405.13143v1.pdf","comment":"CCC 2024"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2405.09504v3","updated":"2024-05-21T11:10:31Z","published":"2024-05-15T16:53:30Z","title":"Initial Algebras Unchained -- A Novel Initial Algebra Construction\n Formalized in Agda","summary":" The initial algebra for an endofunctor F provides a recursion and induction\nscheme for data structures whose constructors are described by F. The\ninitial-algebra construction by Ad\\'amek (1974) starts with the initial object\n(e.g. the empty set) and successively applies the functor until a fixed point\nis reached, an idea inspired by Kleene's fixed point theorem. Depending on the\nfunctor of interest, this may require transfinitely many steps indexed by\nordinal numbers until termination.\n We provide a new initial algebra construction which is not based on an\nordinal-indexed chain. Instead, our construction is loosely inspired by\nPataraia's fixed point theorem and forms the colimit of all finite recursive\ncoalgebras. This is reminiscent of the construction of the rational fixed point\nof an endofunctor that forms the colimit of all finite coalgebras. For our main\ncorrectness theorem, we assume the given endofunctor is accessible on a (weak\nform of) locally presentable category. 
Our proofs are constructive and fully\nformalized in Agda.\n","authors":["Thorsten Wißmann","Stefan Milius"],"pdf_url":"https://arxiv.org/pdf/2405.09504v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07724v2","updated":"2024-05-21T14:27:17Z","published":"2024-05-13T13:19:50Z","title":"Monoidal closure of Grothendieck constructions via $Σ$-tractable\n monoidal structures and Dialectica formulas","summary":" We study the categorical structure of the Grothendieck construction of an\nindexed category $\\mathcal{L}:\\mathcal{C}^{op}\\to\\mathbf{CAT}$ and characterise\nfibred limits, colimits, and monoidal structures. Next, we give sufficient\nconditions for the monoidal closure of the total category $\\Sigma_\\mathcal{C}\n\\mathcal{L}$ of a Grothendieck construction of an indexed category\n$\\mathcal{L}:\\mathcal{C}^{op}\\to\\mathbf{CAT}$. Our analysis is a generalization\nof G\\\"odel's Dialectica interpretation, and it relies on a novel notion of\n$\\Sigma$-tractable monoidal structure. As we will see, $\\Sigma$-tractable\ncoproducts simultaneously generalize cocartesian coclosed structures,\nbiproducts and extensive coproducts. We analyse when the closed structure is\nfibred -- usually it is not.\n","authors":["Fernando Lucatelli Nunes","Matthijs Vákár"],"pdf_url":"https://arxiv.org/pdf/2405.07724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11636v2","updated":"2024-05-21T17:34:45Z","published":"2023-10-18T00:07:38Z","title":"A Uniform Language to Explain Decision Trees","summary":" The formal XAI community has studied a plethora of interpretability queries\naiming to understand the classifications made by decision trees. However, a\nmore uniform understanding of what questions we can hope to answer about these\nmodels, traditionally deemed to be easily interpretable, has remained elusive.\nIn an initial attempt to understand uniform languages for interpretability,\nArenas et al. 
(2021) proposed FOIL, a logic for explaining black-box ML models,\nand showed that it can express a variety of interpretability queries. However,\nwe show that FOIL is limited in two important senses: (i) it is not expressive\nenough to capture some crucial queries, and (ii) its model agnostic nature\nresults in a high computational complexity for decision trees. In this paper,\nwe carefully craft two fragments of first-order logic that allow for\nefficiently interpreting decision trees: Q-DT-FOIL and its optimization variant\nOPT-DT-FOIL. We show that our proposed logics can express not only a variety of\ninterpretability queries considered by previous literature, but also elegantly\nallows users to specify different objectives the sought explanations should\noptimize for. Using finite model-theoretic techniques, we show that the\ndifferent ingredients of Q-DT-FOIL are necessary for its expressiveness, and\nyet that queries in Q-DT-FOIL can be evaluated with a polynomial number of\nqueries to a SAT solver, as well as their optimization versions in OPT-DT-FOIL.\nBesides our theoretical results, we provide a SAT-based implementation of the\nevaluation for OPT-DT-FOIL that is performant on industry-size decision trees.\n","authors":["Marcelo Arenas","Pablo Barcelo","Diego Bustamante","Jose Caraball","Bernardo Subercaseaux"],"pdf_url":"https://arxiv.org/pdf/2310.11636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00240v6","updated":"2024-05-21T17:30:01Z","published":"2022-10-01T10:24:34Z","title":"Executable First-Order Queries in the Logic of Information Flows","summary":" The logic of information flows (LIF) has recently been proposed as a general\nframework in the field of knowledge representation. In this framework, tasks of\nprocedural nature can still be modeled in a declarative, logic-based fashion.\nIn this paper, we focus on the task of query processing under limited access\npatterns, a well-studied problem in the database literature. 
We show that LIF\nis well-suited for modeling this task. Toward this goal, we introduce a variant\nof LIF called \"forward\" LIF (FLIF), in a first-order setting. FLIF takes a\nnovel graph-navigational approach; it is an XPath-like language that\nnevertheless turns out to be equivalent to the \"executable\" fragment of\nfirst-order logic defined by Nash and Lud\\\"ascher. One can also classify the\nvariables in FLIF expressions as inputs and outputs. Expressions where inputs\nand outputs are disjoint, referred to as io-disjoint FLIF expressions, allow a\nparticularly transparent translation into algebraic query plans that respect\nthe access limitations. Finally, we show that general FLIF expressions can\nalways be put into io-disjoint form.\n","authors":["Heba Aamer","Bart Bogaerts","Dimitri Surinx","Eugenia Ternovska","Jan Van den Bussche"],"pdf_url":"https://arxiv.org/pdf/2210.00240v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07656v3","updated":"2024-05-21T16:55:58Z","published":"2024-01-15T12:52:56Z","title":"Learning Explainable and Better Performing Representations of POMDP\n Strategies","summary":" Strategies for partially observable Markov decision processes (POMDP)\ntypically require memory. One way to represent this memory is via automata. We\npresent a method to learn an automaton representation of a strategy using a\nmodification of the L*-algorithm. Compared to the tabular representation of a\nstrategy, the resulting automaton is dramatically smaller and thus also more\nexplainable. Moreover, in the learning process, our heuristics may even improve\nthe strategy's performance. 
In contrast to approaches that synthesize an\nautomaton directly from the POMDP thereby solving it, our approach is\nincomparably more scalable.\n","authors":["Alexander Bork","Debraj Chakraborty","Kush Grover","Jan Kretinsky","Stefanie Mohr"],"pdf_url":"https://arxiv.org/pdf/2401.07656v3.pdf","comment":"Technical report for the submission to TACAS 24"},{"id":"http://arxiv.org/abs/2405.12917v1","updated":"2024-05-21T16:39:26Z","published":"2024-05-21T16:39:26Z","title":"Commutative codensity monads and probability bimeasures","summary":" Several well-studied probability monads have been expressed as codensity\nmonads over small categories of stochastic maps, giving a limit description of\nspaces of probability measures. In this paper we show how properties of\nprobability monads such as commutativity and affineness can arise from their\ncodensity presentation. First we show that their codensity presentation is\nclosely related to another characterisation of probability monads as terminal\nendofunctors admitting certain maps into the Giry monad, which allows us to\ngeneralise a result by Van Breugel on the Kantorovich monad. We then provide\nsufficient conditions for a codensity monad to lift to $\\bf{MonCat}$, and give\na characterisation of exactly pointwise monoidal codensity monads; codensity\nmonads that satisfy a strengthening of these conditions. We show that the Radon\nmonad is exactly pointwise monoidal, and hence give a description of the tensor\nproduct of free algebras of the Radon monad in terms of Day convolution.\nFinally we show that the Giry monad is not exactly pointwise monoidal due to\nthe existence of probability bimeasures that do not extend to measures,\nalthough its restriction to standard Borel spaces is. 
We introduce the notion\nof a $*$-monad and its Kleisli monoidal op-multicategory to describe the\ncategorical structure that organises the spaces of probability polymeasures on\nmeasurable spaces.\n","authors":["Zev Shirazi"],"pdf_url":"https://arxiv.org/pdf/2405.12917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02845v3","updated":"2024-05-21T16:38:05Z","published":"2023-10-04T14:24:26Z","title":"Note on a Translation from First-Order Logic into the Calculus of\n Relations Preserving Validity and Finite Validity","summary":" In this note, we give a linear-size translation from formulas of first-order\nlogic into equations of the calculus of relations preserving validity and\nfinite validity. Our translation also gives a linear-size conservative\nreduction from formulas of first-order logic into formulas of the\nthree-variable fragment of first-order logic.\n","authors":["Yoshiki Nakamura"],"pdf_url":"https://arxiv.org/pdf/2310.02845v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12882v1","updated":"2024-05-21T15:54:03Z","published":"2024-05-21T15:54:03Z","title":"Centralized vs Decentralized Monitors for Hyperproperties","summary":" This paper focuses on the runtime verification of hyperproperties expressed\nin HypermuHML, an expressive yet simple logic for describing properties of sets\nof traces. To this end, we first consider a simple language of monitors that\ncan observe sets of system executions and report verdicts w.r.t. a given\nHypermuHML formula. In this setting, a unique omniscient monitor observes all\nsystem traces, and, in this sense, it is 'centralized'. However, in a possibly\ndistributed system, having a centralized entity is undesirable; hence, we also\nprovide a language for 'decentralized' monitors, where each trace has its own\nmonitor, and monitors for different traces can yield a unique verdict by\ncommunicating their observations. 
For both the centralized and the\ndecentralized settings, we provide a synthesis procedure that, given a formula,\nyields a monitor that is correct (i.e., sound and violation complete). A key\nstep in proving the correctness of the synthesis for decentralized monitors is\na result showing that, for each formula, the synthesized centralized monitor\nand its corresponding decentralized one are weakly bisimilar for a suitable\nnotion of weak bisimulation.\n","authors":["Luca Aceto","Antonis Achilleos","Elli Anastasiadi","Adrian Francalanza","Daniele Gorla","Jana Wagemaker"],"pdf_url":"https://arxiv.org/pdf/2405.12882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12700v1","updated":"2024-05-21T11:49:07Z","published":"2024-05-21T11:49:07Z","title":"Getting Wiser from Multiple Data: Probabilistic Updating according to\n Jeffrey and Pearl","summary":" In probabilistic updating one transforms a prior distribution in the light of\ngiven evidence into a posterior distribution, via what is called conditioning,\nupdating, belief revision or inference. This is the essence of learning, as\nBayesian updating. It will be illustrated via a physical model involving\n(adapted) water flows through pipes with different diameters.\n Bayesian updating makes us wiser, in the sense that the posterior\ndistribution makes the evidence more likely than the prior, since it\nincorporates the evidence. Things are less clear when one wishes to learn from\nmultiple pieces of evidence / data. It turns out that there are (at least) two\nforms of updating for this, associated with Jeffrey and Pearl. The difference\nis not always clearly recognised.\n This paper provides an introduction and an overview in the setting of\ndiscrete probability theory. It starts from an elementary question, involving\nmultiple pieces of evidence, that has been sent to a small group academic\nspecialists. Their answers show considerable differences. 
This is used as\nmotivation and starting point to introduce the two forms of updating, of\nJeffrey and Pearl, for multiple inputs and to elaborate their properties. In\nthe end the account is related to so-called variational free energy (VFE)\nupdate in the cognitive theory of predictive processing. It is shown that both\nJeffrey and Pearl outperform VFE updating and that VFE updating need not\ndecrease divergence - that is correct errors - as it is supposed to do.\n","authors":["Bart Jacobs"],"pdf_url":"https://arxiv.org/pdf/2405.12700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12654v1","updated":"2024-05-21T10:07:29Z","published":"2024-05-21T10:07:29Z","title":"Utilizing Description Logics for Global Explanations of Heterogeneous\n Graph Neural Networks","summary":" Graph Neural Networks (GNNs) are effective for node classification in\ngraph-structured data, but they lack explainability, especially at the global\nlevel. Current research mainly utilizes subgraphs of the input as local\nexplanations or generates new graphs as global explanations. However, these\ngraph-based methods are limited in their ability to explain classes with\nmultiple sufficient explanations. To provide more expressive explanations, we\npropose utilizing class expressions (CEs) from the field of description logic\n(DL). Our approach explains heterogeneous graphs with different types of nodes\nusing CEs in the EL description logic. To identify the best explanation among\nmultiple candidate explanations, we employ and compare two different scoring\nfunctions: (1) For a given CE, we construct multiple graphs, have the GNN make\na prediction for each graph, and aggregate the predicted scores. (2) We score\nthe CE in terms of fidelity, i.e., we compare the predictions of the GNN to the\npredictions by the CE on a separate validation set. 
Instead of subgraph-based\nexplanations, we offer CE-based explanations.\n","authors":["Dominik Köhler","Stefan Heindorf"],"pdf_url":"https://arxiv.org/pdf/2405.12654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12551v1","updated":"2024-05-21T07:37:31Z","published":"2024-05-21T07:37:31Z","title":"RA: A machine based rational agent, Part 1","summary":" RA is a software package that couples machine learning with formal reasoning\nin an attempt to find the laws that generate the empirical data that it has\nbeen given access to. A brief outline of RA in its initial stage of development\nis presented. Particular emphasis is given to current design strategies that\naim to endow RA with the ability to construct its own conjectures of which it\nconstructs proofs.\n","authors":["G. Pantelis"],"pdf_url":"https://arxiv.org/pdf/2405.12551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12426v1","updated":"2024-05-21T00:34:35Z","published":"2024-05-21T00:34:35Z","title":"Inferring Message Flows From System Communication Traces","summary":" This paper proposes a novel method for automatically inferring message flow\nspecifications from the communication traces of a system-on-chip (SoC) design\nthat captures messages exchanged among the components during a system\nexecution. The inferred message flows characterize the communication and\ncoordination of components in a system design for realizing various system\nfunctions, and they are essential for SoC validation and debugging. The\nproposed method relieves the burden of manual development and maintenance of\nsuch specifications on human designers. Our method also uses a new accuracy\nmetric, \\emph{acceptance ratio}, to evaluate the quality of the mined\nspecifications instead of the specification size often used in the previous\nwork, enabling more accurate specifications to be mined. 
Furthermore, this\npaper introduces the concept of essential causalities to enhance the accuracy\nof the message flow mining and accelerate the mining process. The effectiveness\nof the proposed method is evaluated on both synthetic traces and traces\ngenerated from executing several system models in GEM5. In both cases, the\nproposed method achieves superior accuracies compared to a previous approach.\nAdditionally, this paper includes some practical use cases.\n","authors":["Bardia Nadimi","Hao Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.12426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14494v2","updated":"2024-05-21T21:10:58Z","published":"2022-12-30T00:25:12Z","title":"Coinductive Streams in Monoidal Categories","summary":" We introduce monoidal streams. Monoidal streams are a generalization of\ncausal stream functions, which can be defined in cartesian monoidal categories,\nto arbitrary symmetric monoidal categories. In the same way that streams\nprovide semantics to dataflow programming with pure functions, monoidal streams\nprovide semantics to dataflow programming with theories of processes\nrepresented by a symmetric monoidal category. Monoidal streams also form a\nfeedback monoidal category. In the same way that we can use a coinductive\nstream calculus to reason about signal flow graphs, we can use coinductive\nstring diagrams to reason about feedback monoidal categories. As an example, we\nstudy syntax for a stochastic dataflow language, with semantics in stochastic\nmonoidal streams.\n","authors":["Elena Di Lavore","Giovanni de Felice","Mario Román"],"pdf_url":"https://arxiv.org/pdf/2212.14494v2.pdf","comment":"Expanded version of Monoidal Streams for Dataflow Programming,\n arXiv:2202.02061. We thank the reviewers at LMCS for multiple suggestions\n that have improved this version. 
57 pages, 33 figures"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2405.12849v1","updated":"2024-05-21T14:59:39Z","published":"2024-05-21T14:59:39Z","title":"Training and inference in the ReckON RSNN architecture implemented on a\n MPSoC","summary":" With the rise of artificial intelligence, biological neuron models are being\nused to implement neural networks that can learn certain tasks after a training\nphase. One type of such networks are spiking neural networks (SNNs) that rely\non a simplified model for biological neurons, the Integrate and Fire neuron.\nSeveral accelerators have emerged to implement SNNs with this kind of neuron.\nThe ReckON system is one of these that allows both the training and execution\nof a recurrent SNN. The ReckON architecture, implemented on a custom ASIC, can\nbe fully described using a hardware description language. In this work, we\nadapt the Verilog description to implement it on a Xilinx Multiprocessor System\non Chip system (MPSoC). We present the circuits required for the efficient\noperation of the system, and a Python framework to use it on the Pynq ZU\nplatform. We validate the architecture and implementation in two different\nscenarios, and show how the simulated accuracy is preserved with a peak\nperformance of 3.8M events processed per second.\n","authors":["Alejandro Linares-Barranco","Luciano Prono","Robert Lengenstein","Giacomo Indiveri","Charlotte Frenkel"],"pdf_url":"https://arxiv.org/pdf/2405.12849v1.pdf","comment":"Under review at ICECS'24"},{"id":"http://arxiv.org/abs/2403.18367v2","updated":"2024-05-21T13:23:02Z","published":"2024-03-27T08:58:32Z","title":"Merits of Time-Domain Computing for VMM -- A Quantitative Comparison","summary":" Vector-matrix-multiplication (VMM) accel-erators have gained a lot of\ntraction, especially due to therise of convolutional neural networks (CNNs) and\nthe desireto compute them on the edge. 
Besides the classical digitalapproach,\nanalog computing has gone through a renais-sance to push energy efficiency\nfurther. A more recent ap-proach is called time-domain (TD) computing. In\ncontrastto analog computing, TD computing permits easy technol-ogy as well as\nvoltage scaling. As it has received limitedresearch attention, it is not yet\nclear which scenarios aremost suitable to be computed in the TD. In this work,\nweinvestigate these scenarios, focussing on energy efficiencyconsidering\napproximative computations that preserve ac-curacy. Both goals are addressed by\na novel efficiency met-ric, which is used to find a baseline design. We use\nSPICEsimulation data which is fed into a python framework toevaluate how\nperformance scales for VMM computation.We see that TD computing offers best\nenergy efficiency forsmall to medium sized arrays. With throughput and sili-con\nfootprint we investigate two additional metrics, givinga holistic comparison.\n","authors":["Florian Freye","Jie Lou","Christian Lanius","Tobias Gemmeke"],"pdf_url":"https://arxiv.org/pdf/2403.18367v2.pdf","comment":"8 pages, 12 figures. This paper was accepted at the 25th\n International Symposium on Quality Electronic Design(ISQED) 2024. DOI:\n 10.1109/ISQED60706.2024.10528682"},{"id":"http://arxiv.org/abs/2405.12633v1","updated":"2024-05-21T09:38:56Z","published":"2024-05-21T09:38:56Z","title":"Automating Attendance Management in Human Resources: A Design Science\n Approach Using Computer Vision and Facial Recognition","summary":" Haar Cascade is a cost-effective and user-friendly machine learning-based\nalgorithm for detecting objects in images and videos. Unlike Deep Learning\nalgorithms, which typically require significant resources and expensive\ncomputing costs, it uses simple image processing techniques like edge detection\nand Haar features that are easy to comprehend and implement. 
By combining Haar\nCascade with OpenCV2 on an embedded computer like the NVIDIA Jetson Nano, this\nsystem can accurately detect and match faces in a database for attendance\ntracking. This system aims to achieve several specific objectives that set it\napart from existing solutions. It leverages Haar Cascade, enriched with\ncarefully selected Haar features, such as Haar-like wavelets, and employs\nadvanced edge detection techniques. These techniques enable precise face\ndetection and matching in both images and videos, contributing to high accuracy\nand robust performance. By doing so, it minimizes manual intervention and\nreduces errors, thereby strengthening accountability. Additionally, the\nintegration of OpenCV2 and the NVIDIA Jetson Nano optimizes processing\nefficiency, making it suitable for resource-constrained environments. This\nsystem caters to a diverse range of educational institutions, including\nschools, colleges, vocational training centers, and various workplace settings\nsuch as small businesses, offices, and factories. ... The system's\naffordability and efficiency democratize attendance management technology,\nmaking it accessible to a broader audience. Consequently, it has the potential\nto transform attendance tracking and management practices, ultimately leading\nto heightened productivity and accountability. In conclusion, this system\nrepresents a groundbreaking approach to attendance tracking and management...\n","authors":["Bao-Thien Nguyen-Tat","Minh-Quoc Bui","Vuong M. 
Ngo"],"pdf_url":"https://arxiv.org/pdf/2405.12633v1.pdf","comment":"31 pages, accepted to publish by the International Journal of\n Information Management Data Insights (IJIMDS) in 2024"},{"id":"http://arxiv.org/abs/2402.02576v2","updated":"2024-05-21T20:16:41Z","published":"2024-02-04T18:05:02Z","title":"Exploring the Design Space for Message-Driven Systems for Dynamic Graph\n Processing using CCA","summary":" Computer systems that have been successfully deployed for dense regular\nworkloads fall short of achieving scalability and efficiency when applied to\nirregular and dynamic graph applications. Conventional computing systems rely\nheavily on static, regular, numeric intensive computations while High\nPerformance Computing systems executing parallel graph applications exhibit\nlittle locality, spatial or temporal, and are fine-grained and memory\nintensive. With the strong interest in AI which depend on these very different\nuse cases combined with the end of Moore's Law at nanoscale, dramatic\nalternatives in architecture and underlying execution models are required. This\npaper identifies an innovative non-von Neumann architecture, Continuum Computer\nArchitecture (CCA), that redefines the nature of computing structures to yield\npowerful innovations in computational methods to deliver a new generation of\nhighly parallel hardware architecture. CCA reflects a genus of highly parallel\narchitectures that while varying in specific quantities (e.g., memory blocks),\nshare a multiple of attributes not found in typical von Neumann machines. Among\nthese are memory-centric components, message-driven asynchronous flow control,\nand lightweight out-of-order execution across a global name space. Together\nthese innovative non-von Neumann architectural properties guided by a new\noriginal execution model will deliver the new future path for extending beyond\nthe von Neumann model. 
This paper documents a series of interrelated\nexperiments that together establish future directions for next generation\nnon-von Neumann architectures, especially for graph processing.\n","authors":["Bibrak Qamar Chandio","Maciej Brodowicz","Thomas Sterling"],"pdf_url":"https://arxiv.org/pdf/2402.02576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14878v2","updated":"2024-05-21T20:05:51Z","published":"2024-02-21T21:02:11Z","title":"Energy-efficiency Limits on Training AI Systems using Learning-in-Memory","summary":" Learning-in-memory (LIM) is a recently proposed paradigm to overcome\nfundamental memory bottlenecks in training machine learning systems. While\ncompute-in-memory (CIM) approaches can address the so-called memory-wall (i.e.\nenergy dissipated due to repeated memory read access) they are agnostic to the\nenergy dissipated due to repeated memory writes at the precision required for\ntraining (the update-wall), and they don't account for the energy dissipated\nwhen transferring information between short-term and long-term memories (the\nconsolidation-wall). The LIM paradigm proposes that these bottlenecks, too, can\nbe overcome if the energy barrier of physical memories is adaptively modulated\nsuch that the dynamics of memory updates and consolidation match the Lyapunov\ndynamics of gradient-descent training of an AI model. In this paper, we derive\nnew theoretical lower bounds on energy dissipation when training AI systems\nusing different LIM approaches. The analysis presented here is model-agnostic\nand highlights the trade-off between energy efficiency and the speed of\ntraining. The resulting non-equilibrium energy-efficiency bounds have a similar\nflavor as that of Landauer's energy-dissipation bounds. We also extend these\nlimits by taking into account the number of floating-point operations (FLOPs)\nused for training, the size of the AI model, and the precision of the training\nparameters. 
Our projections suggest that the energy-dissipation lower-bound to\ntrain a brain scale AI system (comprising of $10^{15}$ parameters) using LIM is\n$10^8 \\sim 10^9$ Joules, which is on the same magnitude the Landauer's\nadiabatic lower-bound and $6$ to $7$ orders of magnitude lower than the\nprojections obtained using state-of-the-art AI accelerator hardware\nlower-bounds.\n","authors":["Zihao Chen","Johannes Leugering","Gert Cauwenberghs","Shantanu Chakrabartty"],"pdf_url":"https://arxiv.org/pdf/2402.14878v2.pdf","comment":"23 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.13170v1","updated":"2024-05-21T19:37:00Z","published":"2024-05-21T19:37:00Z","title":"FEATHER: A Reconfigurable Accelerator with Data Reordering Support for\n Low-Cost On-Chip Dataflow Switching","summary":" The inference of ML models composed of diverse structures, types, and sizes\nboils down to the execution of different dataflows (i.e. different tiling,\nordering, parallelism, and shapes). Using the optimal dataflow for every layer\nof workload can reduce latency by up to two orders of magnitude over a\nsuboptimal dataflow. Unfortunately, reconfiguring hardware for different\ndataflows involves on-chip data layout reordering and datapath\nreconfigurations, leading to non-trivial overhead that hinders ML accelerators\nfrom exploiting different dataflows, resulting in suboptimal performance. To\naddress this challenge, we propose FEATHER, an innovative accelerator that\nleverages a novel spatial array termed Nest and a novel multi-stage reduction\nnetwork called BIRRD for performing flexible data reduction with layout\nreordering under the hood, enabling seamless switching between optimal\ndataflows with negligible latency and resources overhead. For systematically\nevaluating the performance interaction between dataflows and layouts, we\nenhance Timeloop, a state-of-the-art dataflow cost modeling and search\nframework, with layout assessment capabilities, and term it as Layoutloop. 
We\nmodel FEATHER into Layoutloop and also deploy FEATHER end-to-end on the edge\nZCU104 FPGA. FEATHER delivers 1.27~2.89x inference latency speedup and\n1.3~6.43x energy efficiency improvement compared to various SoTAs like NVDLA,\nSIGMA and Eyeriss under ResNet-50 and MobiletNet-V3 in Layoutloop. On practical\nFPGA devices, FEATHER achieves 2.65/3.91x higher throughput than Xilinx\nDPU/Gemmini. Remarkably, such performance and energy efficiency enhancements\ncome at only 6% area over a fixed-dataflow Eyeriss-like accelerator. Our code\nis released at https://github.com/maeri-project/FEATHER.\n","authors":["Jianming Tong","Anirudh Itagi","Prasanth Chatarasi","Tushar Krishna"],"pdf_url":"https://arxiv.org/pdf/2405.13170v1.pdf","comment":"17 pages, 14 figures. International Symposium on Computer\n Architecture (ISCA), Jun 2024"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2405.12921v1","updated":"2024-05-21T16:45:20Z","published":"2024-05-21T16:45:20Z","title":"Is decidability of the Submonoid Membership Problem closed under finite\n extensions?","summary":" We show that the rational subset membership problem in $G$ can be reduced to\nthe submonoid membership problem in $G{\\times}H$ where $H$ is virtually\nAbelian. We use this to show that there is no algorithm reducing submonoid\nmembership to a finite index subgroup uniformly for all virtually nilpotent\ngroups. We also provide evidence towards the existence of a group $G$ with a\nsubgroup $H 0$ such that\nany depth-$k$ Frege refutation of a random $n$-variable 3-CNF with $\\Theta(n)$\nclauses has $\\Omega(n^{1 + \\epsilon})$ steps w.h.p. 
Our proof involves a novel\nadaptation of the deterministic restriction technique of Chaudhuri and\nRadhakrishnan (STOC'96).\n","authors":["Svyatoslav Gryaznov","Navid Talebanfard"],"pdf_url":"https://arxiv.org/pdf/2403.02275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11761v3","updated":"2024-05-22T15:44:40Z","published":"2023-01-27T14:59:21Z","title":"A Strongly Polynomial-Time Algorithm for Weighted General Factors with\n Three Feasible Degrees","summary":" General factors are a generalization of matchings. Given a graph $G$ with a\nset $\\pi(v)$ of feasible degrees, called a degree constraint, for each vertex\n$v$ of $G$, the general factor problem is to find a (spanning) subgraph $F$ of\n$G$ such that $\\text{deg}_F(x) \\in \\pi(v)$ for every $v$ of $G$. When all\ndegree constraints are symmetric $\\Delta$-matroids, the problem is solvable in\npolynomial time. The weighted general factor problem is to find a general\nfactor of the maximum total weight in an edge-weighted graph. In this paper, we\npresent the first strongly polynomial-time algorithm for a type of weighted\ngeneral factor problems with real-valued edge weights that is provably not\nreducible to the weighted matching problem by gadget constructions.\n","authors":["Shuai Shao","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2301.11761v3.pdf","comment":"This is a full version of an ISAAC 2023 paper"},{"id":"http://arxiv.org/abs/2306.00420v2","updated":"2024-05-22T13:33:10Z","published":"2023-06-01T07:51:03Z","title":"Logics with probabilistic team semantics and the Boolean negation","summary":" We study the expressivity and the complexity of various logics in\nprobabilistic team semantics with the Boolean negation. In particular, we study\nthe extension of probabilistic independence logic with the Boolean negation,\nand a recently introduced logic FOPT. 
We give a comprehensive picture of the\nrelative expressivity of these logics together with the most studied logics in\nprobabilistic team semantics setting, as well as relating their expressivity to\na numerical variant of second-order logic. In addition, we introduce novel\nentropy atoms and show that the extension of first-order logic by entropy atoms\nsubsumes probabilistic independence logic. Finally, we obtain some results on\nthe complexity of model checking, validity, and satisfiability of our logics.\n","authors":["Miika Hannula","Minna Hirvonen","Juha Kontinen","Yasir Mahmood","Arne Meier","Jonni Virtema"],"pdf_url":"https://arxiv.org/pdf/2306.00420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04749v2","updated":"2024-05-22T03:08:39Z","published":"2023-02-09T16:31:48Z","title":"Quantum Advantage from One-Way Functions","summary":" We demonstrate quantum advantage with several basic assumptions, specifically\nbased on only the existence of OWFs. We introduce inefficient-verifier proofs\nof quantumness (IV-PoQ), and construct it from classical bit commitments.\nIV-PoQ is an interactive protocol between a verifier and a quantum prover\nconsisting of two phases. In the first phase, the verifier is probabilistic\npolynomial-time, and it interacts with the prover. In the second phase, the\nverifier becomes inefficient, and makes its decision based on the transcript of\nthe first phase. If the prover is honest, the inefficient verifier accepts with\nhigh probability, but any classical malicious prover only has a small\nprobability of being accepted by the inefficient verifier. Our construction\ndemonstrates the following results: (1)If one-way functions exist, then IV-PoQ\nexist. (2)If distributional collision-resistant hash functions exist (which\nexist if hard-on-average problems in $\\mathbf{SZK}$ exist), then constant-round\nIV-PoQ exist. We also demonstrate quantum advantage based on worst-case-hard\nassumptions. 
We define auxiliary-input IV-PoQ (AI-IV-PoQ) that only require\nthat for any malicious prover, there exist infinitely many auxiliary inputs\nunder which the prover cannot cheat. We construct AI-IV-PoQ from an\nauxiliary-input version of commitments in a similar way, showing that (1)If\nauxiliary-input one-way functions exist (which exist if\n$\\mathbf{CZK}\\not\\subseteq\\mathbf{BPP}$), then AI-IV-PoQ exist. (2)If\nauxiliary-input collision-resistant hash functions exist (which is equivalent\nto $\\mathbf{PWPP}\\nsubseteq \\mathbf{FBPP}$) or $\\mathbf{SZK}\\nsubseteq\n\\mathbf{BPP}$, then constant-round AI-IV-PoQ exist.\n","authors":["Tomoyuki Morimae","Takashi Yamakawa"],"pdf_url":"https://arxiv.org/pdf/2302.04749v2.pdf","comment":"52pages"},{"id":"http://arxiv.org/abs/2405.13273v1","updated":"2024-05-22T01:08:54Z","published":"2024-05-22T01:08:54Z","title":"Dequantizability from inputs","summary":" By comparing constructions of block encoding given by [1-4], we propose a way\nto extract dequantizability from advancements in dequantization techniques that\nhave been led by Tang, as in [5]. Then we apply this notion to the\nsparse-access input model that is known to be BQP-complete in general, thereby\nconceived to be un-dequantizable. Our goal is to break down this belief by\nexamining the sparse-access input model's instances, particularly their input\nmatrices. In conclusion, this paper forms a dequantizability-verifying scheme\nthat can be applied whenever an input is given.\n","authors":["Tae-Won Kim","Byung-Soo Choi"],"pdf_url":"https://arxiv.org/pdf/2405.13273v1.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2405.12525v2","updated":"2024-05-22T10:22:09Z","published":"2024-05-21T06:30:00Z","title":"Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels","summary":" Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific\ncodes. 
Due to the heavy strain on the main memory interface from loading the\nsparse matrix and the possibly irregular memory access pattern, SpMV typically\nexhibits low arithmetic intensity. Repeating these products multiple times with\nthe same matrix is required in many algorithms. This so-called matrix power\nkernel (MPK) provides an opportunity for data reuse since the same matrix data\nis loaded from main memory multiple times, an opportunity that has only\nrecently been exploited successfully with the Recursive Algebraic Coloring\nEngine (RACE). Using RACE, one considers a graph based formulation of the SpMV\nand employs s level-based implementation of SpMV for reuse of relevant matrix\ndata. However, the underlying data dependencies have restricted the use of this\nconcept to shared memory parallelization and thus to single compute nodes.\nEnabling cache blocking for distributed-memory parallelization of MPK is\nchallenging due to the need for explicit communication and synchronization of\ndata in neighboring levels. In this work, we propose and implement a flexible\nmethod that interleaves the cache-blocking capabilities of RACE with an MPI\ncommunication scheme that fulfills all data dependencies among processes.\nCompared to a \"traditional\" distributed memory parallel MPK, our new\nDistributed Level-Blocked MPK yields substantial speed-ups on modern Intel and\nAMD architectures across a wide range of sparse matrices from various\nscientific applications. Finally, we address a modern quantum physics problem\nto demonstrate the applicability of our method, achieving a speed-up of up to\n4x on 832 cores of an Intel Sapphire Rapids cluster.\n","authors":["Dane C. Lacey","Christie L. 
Alappat","Florian Lange","Georg Hager","Holger Fehske","Gerhard Wellein"],"pdf_url":"https://arxiv.org/pdf/2405.12525v2.pdf","comment":"15 pages, 12 figures, 5 tables; added affiliation & extended\n acknowledgment"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2402.17604v3","updated":"2024-05-22T16:27:15Z","published":"2024-02-27T15:31:00Z","title":"Equivariant ideals of polynomials","summary":" We study existence and computability of finite bases for ideals of\npolynomials over infinitely many variables. In our setting, variables come from\na countable logical structure A, and embeddings from A to A act on polynomials\nby renaming variables. First, we give a sufficient and necessary condition for\nA to guarantee the following generalisation of Hilbert's Basis Theorem: every\npolynomial ideal which is equivariant, i.e. invariant under renaming of\nvariables, is finitely generated. Second, we develop an extension of classical\nBuchberger's algorithm to compute a Gr\\\"obner basis of a given equivariant\nideal. This implies decidability of the membership problem for equivariant\nideals. Finally, we sketch upon various applications of these results to\nregister automata, Petri nets with data, orbit-finitely generated vector\nspaces, and orbit-finite systems of linear equations.\n","authors":["Arka Ghosh","Sławomir Lasota"],"pdf_url":"https://arxiv.org/pdf/2402.17604v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13897v3","updated":"2024-05-22T18:07:48Z","published":"2023-10-21T03:26:39Z","title":"Masked Hard-Attention Transformers Recognize Exactly the Star-Free\n Languages","summary":" The expressive power of transformers over inputs of unbounded size can be\nstudied through their ability to recognize classes of formal languages. 
We\nconsider transformer encoders with hard attention (in which all attention is\nfocused on exactly one position) and strict future masking (in which each\nposition only attends to positions strictly to its left), and prove that they\nare equivalent to linear temporal logic (LTL), which defines exactly the\nstar-free languages. A key technique is the use of Boolean RASP as a convenient\nintermediate language between transformers and LTL. We then take numerous\nresults known for LTL and apply them to transformers, characterizing how\nposition embeddings, strict masking, and depth increase expressive power.\n","authors":["Andy Yang","David Chiang","Dana Angluin"],"pdf_url":"https://arxiv.org/pdf/2310.13897v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17444v2","updated":"2024-05-22T13:07:13Z","published":"2024-01-30T21:16:31Z","title":"Languages of Higher-Dimensional Timed Automata","summary":" We present a new language semantics for real-time concurrency. Its\noperational models are higher-dimensional timed automata (HDTAs), a\ngeneralization of both higher-dimensional automata and timed automata. We\ndefine languages of HDTAs as sets of interval-timed pomsets with interfaces. As\nan application, we show that language inclusion of HDTAs is undecidable. On the\nother hand, using a region construction we can show that untimings of HDTA\nlanguages have enough regularity so that untimed language inclusion is\ndecidable.\n","authors":["Amazigh Amrane","Hugo Bazille","Emily Clement","Uli Fahrenberg"],"pdf_url":"https://arxiv.org/pdf/2401.17444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13488v1","updated":"2024-05-22T09:57:49Z","published":"2024-05-22T09:57:49Z","title":"Non-Deterministic Planning for Hyperproperty Verification","summary":" Non-deterministic planning aims to find a policy that achieves a given\nobjective in an environment where actions have uncertain effects, and the agent\n- potentially - only observes parts of the current state. 
Hyperproperties are\nproperties that relate multiple paths of a system and can, e.g., capture\nsecurity and information-flow policies. Popular logics for expressing temporal\nhyperproperties - such as HyperLTL - extend LTL by offering selective\nquantification over executions of a system. In this paper, we show that\nplanning offers a powerful intermediate language for the automated verification\nof hyperproperties. Concretely, we present an algorithm that, given a HyperLTL\nverification problem, constructs a non-deterministic multi-agent planning\ninstance (in the form of a QDec-POMDP) that, when admitting a plan, implies the\nsatisfaction of the verification problem. We show that for large fragments of\nHyperLTL, the resulting planning instance corresponds to a classical, FOND, or\nPOND planning problem. We implement our encoding in a prototype verification\ntool and report on encouraging experimental results.\n","authors":["Raven Beutner","Bernd Finkbeiner"],"pdf_url":"https://arxiv.org/pdf/2405.13488v1.pdf","comment":"ICAPS 2024"},{"id":"http://arxiv.org/abs/2206.14273v3","updated":"2024-05-22T14:29:46Z","published":"2022-06-28T20:04:24Z","title":"Asymptotic bounds for the number of closed and privileged words","summary":" A word~$w$ has a border $u$ if $u$ is a non-empty proper prefix and suffix of\n$u$. A word~$w$ is said to be \\emph{closed} if $w$ is of length at most $1$ or\nif $w$ has a border that occurs exactly twice in $w$. A word~$w$ is said to be\n\\emph{privileged} if $w$ is of length at most $1$ or if $w$ has a privileged\nborder that occurs exactly twice in $w$. Let $C_k(n)$ (resp.~$P_k(n)$) be the\nnumber of length-$n$ closed (resp. privileged) words over a $k$-letter\nalphabet. In this paper, we improve existing upper and lower bounds on $C_k(n)$\nand $P_k(n)$. We completely resolve the asymptotic behaviour of $C_k(n)$. 
We\nalso nearly completely resolve the asymptotic behaviour of $P_k(n)$ by giving a\nfamily of upper and lower bounds that are separated by a factor that grows\narbitrarily slowly.\n","authors":["Daniel Gabric"],"pdf_url":"https://arxiv.org/pdf/2206.14273v3.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2310.13897v3","updated":"2024-05-22T18:07:48Z","published":"2023-10-21T03:26:39Z","title":"Masked Hard-Attention Transformers Recognize Exactly the Star-Free\n Languages","summary":" The expressive power of transformers over inputs of unbounded size can be\nstudied through their ability to recognize classes of formal languages. We\nconsider transformer encoders with hard attention (in which all attention is\nfocused on exactly one position) and strict future masking (in which each\nposition only attends to positions strictly to its left), and prove that they\nare equivalent to linear temporal logic (LTL), which defines exactly the\nstar-free languages. A key technique is the use of Boolean RASP as a convenient\nintermediate language between transformers and LTL. We then take numerous\nresults known for LTL and apply them to transformers, characterizing how\nposition embeddings, strict masking, and depth increase expressive power.\n","authors":["Andy Yang","David Chiang","Dana Angluin"],"pdf_url":"https://arxiv.org/pdf/2310.13897v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17604v3","updated":"2024-05-22T16:27:15Z","published":"2024-02-27T15:31:00Z","title":"Equivariant ideals of polynomials","summary":" We study existence and computability of finite bases for ideals of\npolynomials over infinitely many variables. In our setting, variables come from\na countable logical structure A, and embeddings from A to A act on polynomials\nby renaming variables. 
First, we give a sufficient and necessary condition for\nA to guarantee the following generalisation of Hilbert's Basis Theorem: every\npolynomial ideal which is equivariant, i.e. invariant under renaming of\nvariables, is finitely generated. Second, we develop an extension of classical\nBuchberger's algorithm to compute a Gr\\\"obner basis of a given equivariant\nideal. This implies decidability of the membership problem for equivariant\nideals. Finally, we sketch upon various applications of these results to\nregister automata, Petri nets with data, orbit-finitely generated vector\nspaces, and orbit-finite systems of linear equations.\n","authors":["Arka Ghosh","Sławomir Lasota"],"pdf_url":"https://arxiv.org/pdf/2402.17604v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13736v1","updated":"2024-05-22T15:29:40Z","published":"2024-05-22T15:29:40Z","title":"Towards Counting Markov Equivalence Classes with Logical Constraints","summary":" We initiate the study of counting Markov Equivalence Classes (MEC) under\nlogical constraints. MECs are equivalence classes of Directed Acyclic Graphs\n(DAGs) that encode the same conditional independence structure among the random\nvariables of a DAG model. Observational data can only allow to infer a DAG\nmodel up to Markov Equivalence. However, Markov equivalent DAGs can represent\ndifferent causal structures, potentially super-exponentially many. Hence,\nunderstanding MECs combinatorially is critical to understanding the complexity\nof causal inference. In this paper, we focus on analysing MECs of size one,\nwith logical constraints on the graph topology. We provide a polynomial-time\nalgorithm (w.r.t. the number of nodes) for enumerating essential DAGs (the only\nmembers of an MEC of size one) with arbitrary logical constraints expressed in\nfirst-order logic with two variables and counting quantifiers (C^2). 
Our work\nbrings together recent developments in tractable first-order model counting and\ncombinatorics of MECs.\n","authors":["Davide Bizzaro","Luciano Serafini","Sagar Malhotra"],"pdf_url":"https://arxiv.org/pdf/2405.13736v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2405.13715v1","updated":"2024-05-22T15:06:50Z","published":"2024-05-22T15:06:50Z","title":"Traffic Scenario Logic: A Spatial-Temporal Logic for Modeling and\n Reasoning of Urban Traffic Scenarios","summary":" Formal representations of traffic scenarios can be used to generate test\ncases for the safety verification of autonomous driving. However, most existing\nmethods are limited in highway or highly simplified intersection scenarios due\nto the intricacy and diversity of traffic scenarios. In response, we propose\nTraffic Scenario Logic (TSL), which is a spatial-temporal logic designed for\nmodeling and reasoning of urban pedestrian-free traffic scenarios. TSL provides\na formal representation of the urban road network that can be derived from\nOpenDRIVE, i.e., the de facto industry standard of high-definition maps for\nautonomous driving, enabling the representation of a broad range of traffic\nscenarios. We implemented the reasoning of TSL using Telingo, i.e., a solver\nfor temporal programs based on the Answer Set Programming, and tested it on\ndifferent urban road layouts. 
Demonstrations show the effectiveness of TSL in\ntest scenario generation and its potential value in areas like decision-making\nand control verification of autonomous driving.\n","authors":["Ruolin Wang","Yuejiao Xu","Jianmin Ji"],"pdf_url":"https://arxiv.org/pdf/2405.13715v1.pdf","comment":"Submitted to KR 2024"},{"id":"http://arxiv.org/abs/2405.13704v1","updated":"2024-05-22T14:49:06Z","published":"2024-05-22T14:49:06Z","title":"Safe and Personalizable Logical Guidance for Trajectory Planning of\n Autonomous Driving","summary":" Autonomous vehicles necessitate a delicate balance between safety,\nefficiency, and user preferences in trajectory planning. Existing traditional\nor learning-based methods face challenges in adequately addressing all these\naspects. In response, this paper proposes a novel component termed the Logical\nGuidance Layer (LGL), designed for seamless integration into autonomous driving\ntrajectory planning frameworks, specifically tailored for highway scenarios.\nThe LGL guides the trajectory planning with a local target area determined\nthrough scenario reasoning, scenario evaluation, and guidance area calculation.\nIntegrating the Responsibility-Sensitive Safety (RSS) model, the LGL ensures\nformal safety guarantees while accommodating various user preferences defined\nby logical formulae. 
Experimental validation demonstrates the effectiveness of\nthe LGL in achieving a balance between safety and efficiency, and meeting user\npreferences in autonomous highway driving scenarios.\n","authors":["Yuejiao Xu","Ruolin Wang","Chengpeng Xu","Jianmin Ji"],"pdf_url":"https://arxiv.org/pdf/2405.13704v1.pdf","comment":"Submitted to ITSC 2024"},{"id":"http://arxiv.org/abs/2405.13697v1","updated":"2024-05-22T14:41:47Z","published":"2024-05-22T14:41:47Z","title":"The complexity of deciding characteristic formulae in van Glabbeek's\n branching-time spectrum","summary":" Characteristic formulae give a complete logical description of the behaviour\nof processes modulo some chosen notion of behavioural semantics. They allow one\nto reduce equivalence or preorder checking to model checking, and are exactly\nthe formulae in the modal logics characterizing classic behavioural\nequivalences and preorders for which model checking can be reduced to\nequivalence or preorder checking.\n This paper studies the complexity of determining whether a formula is\ncharacteristic for some finite, loop-free process in each of the logics\nproviding modal characterizations of the simulation-based semantics in van\nGlabbeek's branching-time spectrum. 
Since characteristic formulae in each of\nthose logics are exactly the consistent and prime ones, it presents complexity\nresults for the satisfiability and primality problems, and investigates the\nboundary between modal logics for which those problems can be solved in\npolynomial time and those for which they become computationally hard.\n Amongst other contributions, this article also studies the complexity of\nconstructing characteristic formulae in the modal logics characterizing\nsimulation-based semantics, both when such formulae are presented in explicit\nform and via systems of equations.\n","authors":["Luca Aceto","Antonis Achilleos","Aggeliki Chalki","Anna Ingolfsdottir"],"pdf_url":"https://arxiv.org/pdf/2405.13697v1.pdf","comment":"64 pages, 1 figure"},{"id":"http://arxiv.org/abs/2306.00420v2","updated":"2024-05-22T13:33:10Z","published":"2023-06-01T07:51:03Z","title":"Logics with probabilistic team semantics and the Boolean negation","summary":" We study the expressivity and the complexity of various logics in\nprobabilistic team semantics with the Boolean negation. In particular, we study\nthe extension of probabilistic independence logic with the Boolean negation,\nand a recently introduced logic FOPT. We give a comprehensive picture of the\nrelative expressivity of these logics together with the most studied logics in\nprobabilistic team semantics setting, as well as relating their expressivity to\na numerical variant of second-order logic. In addition, we introduce novel\nentropy atoms and show that the extension of first-order logic by entropy atoms\nsubsumes probabilistic independence logic. 
Finally, we obtain some results on\nthe complexity of model checking, validity, and satisfiability of our logics.\n","authors":["Miika Hannula","Minna Hirvonen","Juha Kontinen","Yasir Mahmood","Arne Meier","Jonni Virtema"],"pdf_url":"https://arxiv.org/pdf/2306.00420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17444v2","updated":"2024-05-22T13:07:13Z","published":"2024-01-30T21:16:31Z","title":"Languages of Higher-Dimensional Timed Automata","summary":" We present a new language semantics for real-time concurrency. Its\noperational models are higher-dimensional timed automata (HDTAs), a\ngeneralization of both higher-dimensional automata and timed automata. We\ndefine languages of HDTAs as sets of interval-timed pomsets with interfaces. As\nan application, we show that language inclusion of HDTAs is undecidable. On the\nother hand, using a region construction we can show that untimings of HDTA\nlanguages have enough regularity so that untimed language inclusion is\ndecidable.\n","authors":["Amazigh Amrane","Hugo Bazille","Emily Clement","Uli Fahrenberg"],"pdf_url":"https://arxiv.org/pdf/2401.17444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13583v1","updated":"2024-05-22T12:24:18Z","published":"2024-05-22T12:24:18Z","title":"Tools at the Frontiers of Quantitative Verification","summary":" The analysis of formal models that include quantitative aspects such as\ntiming or probabilistic choices is performed by quantitative verification\ntools. Broad and mature tool support is available for computing basic\nproperties such as expected rewards on basic models such as Markov chains.\nPrevious editions of QComp, the comparison of tools for the analysis of\nquantitative formal models, focused on this setting. Many application\nscenarios, however, require more advanced property types such as LTL and\nparameter synthesis queries as well as advanced models like stochastic games\nand partially observable MDPs. 
For these, tool support is in its infancy today.\nThis paper presents the outcomes of QComp 2023: a survey of the state of the\nart in quantitative verification tool support for advanced property types and\nmodels. With tools ranging from first research prototypes to well-supported\nintegrations into established toolsets, this report highlights today's active\nareas and tomorrow's challenges in tool-focused research for quantitative\nverification.\n","authors":["Roman Andriushchenko","Alexander Bork","Carlos E. Budde","Milan Češka","Kush Grover","Ernst Moritz Hahn","Arnd Hartmanns","Bryant Israelsen","Nils Jansen","Joshua Jeppson","Sebastian Junges","Maximilian A. Köhl","Bettina Könighofer","Jan Křetínský","Tobias Meggendorfer","David Parker","Stefan Pranger","Tim Quatmann","Enno Ruijters","Landon Taylor","Matthias Volk","Maximilian Weininger","Zhen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.13583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13488v1","updated":"2024-05-22T09:57:49Z","published":"2024-05-22T09:57:49Z","title":"Non-Deterministic Planning for Hyperproperty Verification","summary":" Non-deterministic planning aims to find a policy that achieves a given\nobjective in an environment where actions have uncertain effects, and the agent\n- potentially - only observes parts of the current state. Hyperproperties are\nproperties that relate multiple paths of a system and can, e.g., capture\nsecurity and information-flow policies. Popular logics for expressing temporal\nhyperproperties - such as HyperLTL - extend LTL by offering selective\nquantification over executions of a system. In this paper, we show that\nplanning offers a powerful intermediate language for the automated verification\nof hyperproperties. 
Concretely, we present an algorithm that, given a HyperLTL\nverification problem, constructs a non-deterministic multi-agent planning\ninstance (in the form of a QDec-POMDP) that, when admitting a plan, implies the\nsatisfaction of the verification problem. We show that for large fragments of\nHyperLTL, the resulting planning instance corresponds to a classical, FOND, or\nPOND planning problem. We implement our encoding in a prototype verification\ntool and report on encouraging experimental results.\n","authors":["Raven Beutner","Bernd Finkbeiner"],"pdf_url":"https://arxiv.org/pdf/2405.13488v1.pdf","comment":"ICAPS 2024"},{"id":"http://arxiv.org/abs/2405.13461v1","updated":"2024-05-22T09:02:12Z","published":"2024-05-22T09:02:12Z","title":"Analogical proportions II","summary":" Analogical reasoning is the ability to detect parallels between two seemingly\ndistant objects or situations, a fundamental human capacity used for example in\ncommonsense reasoning, learning, and creativity which is believed by many\nresearchers to be at the core of human and artificial general intelligence.\nAnalogical proportions are expressions of the form ``$a$ is to $b$ what $c$ is\nto $d$'' at the core of analogical reasoning. The author has recently\nintroduced an abstract algebraic framework of analogical proportions within the\ngeneral setting of universal algebra. 
It is the purpose of this paper to\nfurther develop the mathematical theory of analogical proportions within that\nframework as motivated by the fact that it has already been successfully\napplied to logic program synthesis in artificial intelligence.\n","authors":["Christian Antić"],"pdf_url":"https://arxiv.org/pdf/2405.13461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13435v1","updated":"2024-05-22T08:24:27Z","published":"2024-05-22T08:24:27Z","title":"A Coherence Construction for the Propositional Universe","summary":" We record a particularly simple construction on top of Lumsdaine's local\nuniverses that allows for a Coquand-style universe of propositions with\npropositional extensionality to be interpreted in a category with subobject\nclassifiers.\n","authors":["Xu Huang"],"pdf_url":"https://arxiv.org/pdf/2405.13435v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2405.13416v1","updated":"2024-05-22T07:50:43Z","published":"2024-05-22T07:50:43Z","title":"Source-level reasoning for quantitative information flow","summary":" We present a novel formal system for proving quantitative-leakage properties\nof programs. 
Based on a theory of Quantitative Information Flow (QIF) that\nmodels information leakage as a noisy communication channel, it uses\n\"gain-functions\" for the description and measurement of expected leaks.\n We use a small imperative programming language, augmented with leakage\nfeatures, and with it express adversaries' activities in the style of, but more\ngenerally than, the Hoare triples or expectation transformers that\ntraditionally express deterministic or probabilistic correctness but without\ninformation flow.\n The programs are annotated with \"gain-expressions\" that capture simple\nadversarial settings such as \"Guess the secret in one try.\" but also much more\ngeneral ones; and our formal syntax and logic -based framework enables us to\ntransform such gain-expressions that apply after a program has finished to ones\nthat equivalently apply before the program has begun.\n In that way we enable a formal proof-based reasoning system for QIF at the\nsource level. We apply it to the %programming language we have chosen, and\ndemonstrate its effectiveness in a number of small but sometimes intricate\nsituations.\n","authors":["Chris Chen","Annabelle McIver","Carroll Morgan"],"pdf_url":"https://arxiv.org/pdf/2405.13416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08308v2","updated":"2024-05-22T02:57:19Z","published":"2024-04-12T08:06:32Z","title":"Composing Codensity Bisimulations","summary":" Proving compositionality of behavioral equivalence on state-based systems\nwith respect to algebraic operations is a classical and widely studied problem.\nWe study a categorical formulation of this problem, where operations on\nstate-based systems modeled as coalgebras can be elegantly captured through\ndistributive laws between functors. 
To prove compositionality, it then suffices\nto show that this distributive law lifts from sets to relations, giving an\nexplanation of how behavioral equivalence on smaller systems can be combined to\nobtain behavioral equivalence on the composed system.\n In this paper, we refine this approach by focusing on so-called codensity\nlifting of functors, which gives a very generic presentation of various notions\nof (bi)similarity as well as quantitative notions such as behavioral metrics on\nprobabilistic systems. The key idea is to use codensity liftings both at the\nlevel of algebras and coalgebras, using a new generalization of the codensity\nlifting. The problem of lifting distributive laws then reduces to the abstract\nproblem of constructing distributive laws between codensity liftings, for which\nwe propose a simplified sufficient condition. Our sufficient condition\ninstantiates to concrete proof methods for compositionality of algebraic\noperations on various types of state-based systems. We instantiate our results\nto prove compositionality of qualitative and quantitative properties of\ndeterministic automata. We also explore the limits of our approach by including\nan example of probabilistic systems, where it is unclear whether the sufficient\ncondition holds, and instead we use our setting to give a direct proof of\ncompositionality. ...\n","authors":["Mayuko Kori","Kazuki Watanabe","Jurriaan Rot","Shin-ya Katsumata"],"pdf_url":"https://arxiv.org/pdf/2404.08308v2.pdf","comment":"Extended version (includes the Appendix) of the paper accepted at\n LiCS-24"},{"id":"http://arxiv.org/abs/2405.13271v1","updated":"2024-05-22T00:51:38Z","published":"2024-05-22T00:51:38Z","title":"Verifying Lock-free Search Structure Templates","summary":" We present and verify template algorithms for lock-free concurrent search\nstructures that cover a broad range of existing implementations based on lists\nand skiplists. 
Our linearizability proofs are fully mechanized in the\nconcurrent separation logic Iris. The proofs are modular and cover the broader\ndesign space of the underlying algorithms by parameterizing the verification\nover aspects such as the low-level representation of nodes and the style of\ndata structure maintenance. As a further technical contribution, we present a\nmechanization of a recently proposed method for reasoning about\nfuture-dependent linearization points using hindsight arguments. The\nmechanization builds on Iris' support for prophecy reasoning and user-defined\nghost resources. We demonstrate that the method can help to reduce the proof\neffort compared to direct prophecy-based proofs.\n","authors":["Nisarg Patel","Dennis Shasha","Thomas Wies"],"pdf_url":"https://arxiv.org/pdf/2405.13271v1.pdf","comment":"Extended version of an article to appear in ECOOP'24"}]},"2024-05-23T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2311.02650v3","updated":"2024-05-23T13:47:07Z","published":"2023-11-05T13:37:15Z","title":"Ephemeral Rollups are All you Need","summary":" In the realm of open and composable gaming, we envision platforms where users\nactively expand, create, engage, and immerse themselves in a rich world of\nentertainment. One promising avenue for achieving this vision is through fully\non-chain (FOC) games, where both game state and logic reside on the blockchain,\nmaximizing composability. However, we must grapple with inherent limitations\nand trade-offs, particularly in terms of costs and scalability. This paper\nproposes a framework that leverages the Solana Virtual Machine (SVM) to scale\nFOC games without state fragmentation or compromised trust assumptions. The\nframework introduces a systematic approach for discovering, utilizing, and\npublishing modular pieces of logic as components deeply rooted in the\nEntity-Component-System (ECS) pattern. 
To enhance scalability and resource\noptimization, we introduce the concept of Ephemeral Rollups (ERs) that overcome\nthe tradeoffs of L2s horizontal scaling. These dedicated runtimes can be\ncustomized to provide higher operational speed, configurable ticking\nmechanisms, provable sessions and gasless transactions without\ncomposability-scalability tradeoffs.\n","authors":["Gabriele Picco","Andrea Fortugno"],"pdf_url":"https://arxiv.org/pdf/2311.02650v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14791v1","updated":"2024-05-23T17:01:53Z","published":"2024-05-23T17:01:53Z","title":"Recurrent Early Exits for Federated Learning with Heterogeneous Clients","summary":" Federated learning (FL) has enabled distributed learning of a model across\nmultiple clients in a privacy-preserving manner. One of the main challenges of\nFL is to accommodate clients with varying hardware capacities; clients have\ndiffering compute and memory requirements. To tackle this challenge, recent\nstate-of-the-art approaches leverage the use of early exits. Nonetheless, these\napproaches fall short of mitigating the challenges of joint learning multiple\nexit classifiers, often relying on hand-picked heuristic solutions for\nknowledge distillation among classifiers and/or utilizing additional layers for\nweaker classifiers. In this work, instead of utilizing multiple classifiers, we\npropose a recurrent early exit approach named ReeFL that fuses features from\ndifferent sub-models into a single shared classifier. Specifically, we use a\ntransformer-based early-exit module shared among sub-models to i) better\nexploit multi-layer feature representations for task-specific prediction and\nii) modulate the feature representation of the backbone model for subsequent\npredictions. We additionally present a per-client self-distillation approach\nwhere the best sub-model is automatically selected as the teacher of the other\nsub-models at each client. 
Our experiments on standard image and speech\nclassification benchmarks across various emerging federated fine-tuning\nbaselines demonstrate ReeFL's effectiveness over previous works.\n","authors":["Royson Lee","Javier Fernandez-Marques","Shell Xu Hu","Da Li","Stefanos Laskaridis","Łukasz Dudziak","Timothy Hospedales","Ferenc Huszár","Nicholas D. Lane"],"pdf_url":"https://arxiv.org/pdf/2405.14791v1.pdf","comment":"Accepted at the 41st International Conference on Machine Learning\n (ICML 2024)"},{"id":"http://arxiv.org/abs/2404.10087v2","updated":"2024-05-23T16:14:49Z","published":"2024-04-15T18:50:44Z","title":"cuFastTuckerPlus: A Stochastic Parallel Sparse FastTucker Decomposition\n Using GPU Tensor Cores","summary":" Sparse tensors are prevalent in real-world applications, often characterized\nby their large-scale, high-order, and high-dimensional nature. Directly\nhandling raw tensors is impractical due to the significant memory and\ncomputational overhead involved. The current mainstream approach involves\ncompressing or decomposing the original tensor. One popular tensor\ndecomposition algorithm is the Tucker decomposition. However, existing\nstate-of-the-art algorithms for large-scale Tucker decomposition typically\nrelax the original optimization problem into multiple convex optimization\nproblems to ensure polynomial convergence. Unfortunately, these algorithms tend\nto converge slowly. In contrast, tensor decomposition exhibits a simple\noptimization landscape, making local search algorithms capable of converging to\na global (approximate) optimum much faster. In this paper, we propose the\nFastTuckerPlus algorithm, which decomposes the original optimization problem\ninto two non-convex optimization problems and solves them alternately using the\nStochastic Gradient Descent method. Furthermore, we introduce cuFastTuckerPlus,\na fine-grained parallel algorithm designed for GPU platforms, leveraging the\nperformance of tensor cores. 
This algorithm minimizes memory access overhead\nand computational costs, surpassing the state-of-the-art algorithms. Our\nexperimental results demonstrate that our method achieves a speedup of $3X$ to\n$5X$ compared to state-of-the-art algorithms.\n","authors":["Zixuan Li","Mingxing Duan","Huizhang Luo","Wangdong Yang","Kenli Li","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2404.10087v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14647v1","updated":"2024-05-23T14:48:23Z","published":"2024-05-23T14:48:23Z","title":"The integration of heterogeneous resources in the CMS Submission\n Infrastructure for the LHC Run 3 and beyond","summary":" While the computing landscape supporting LHC experiments is currently\ndominated by x86 processors at WLCG sites, this configuration will evolve in\nthe coming years. LHC collaborations will be increasingly employing HPC and\nCloud facilities to process the vast amounts of data expected during the LHC\nRun 3 and the future HL-LHC phase. These facilities often feature diverse\ncompute resources, including alternative CPU architectures like ARM and IBM\nPower, as well as a variety of GPU specifications. Using these heterogeneous\nresources efficiently is thus essential for the LHC collaborations reaching\ntheir future scientific goals. The Submission Infrastructure (SI) is a central\nelement in CMS Computing, enabling resource acquisition and exploitation by CMS\ndata processing, simulation and analysis tasks. The SI must therefore be\nadapted to ensure access and optimal utilization of this heterogeneous compute\ncapacity. Some steps in this evolution have been already taken, as CMS is\ncurrently using opportunistically a small pool of GPU slots provided mainly at\nthe CMS WLCG sites. Additionally, Power9 processors have been validated for CMS\nproduction at the Marconi-100 cluster at CINECA. 
This note will describe the\nupdated capabilities of the SI to continue ensuring the efficient allocation\nand use of computing resources by CMS, despite their increasing diversity. The\nnext steps towards a full integration and support of heterogeneous resources\naccording to CMS needs will also be reported.\n","authors":["Antonio Perez-Calero Yzquierdo","Marco Mascheroni","Edita Kizinevic","Farrukh Aftab Khan","Hyunwoo Kim","Maria Acosta Flechas","Nikos Tsipinakis","Saqib Haleem"],"pdf_url":"https://arxiv.org/pdf/2405.14647v1.pdf","comment":"26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR\n PHYSICS - 2023"},{"id":"http://arxiv.org/abs/2405.14644v1","updated":"2024-05-23T14:46:10Z","published":"2024-05-23T14:46:10Z","title":"Adoption of a token-based authentication model for the CMS Submission\n Infrastructure","summary":" The CMS Submission Infrastructure (SI) is the main computing resource\nprovisioning system for CMS workloads. A number of HTCondor pools are employed\nto manage this infrastructure, which aggregates geographically distributed\nresources from the WLCG and other providers. Historically, the model of\nauthentication among the diverse components of this infrastructure has relied\non the Grid Security Infrastructure (GSI), based on identities and X509\ncertificates. In contrast, commonly used modern authentication standards are\nbased on capabilities and tokens. The WLCG has identified this trend and aims\nat a transparent replacement of GSI for all its workload management, data\ntransfer and storage access operations, to be completed during the current LHC\nRun 3. As part of this effort, and within the context of CMS computing, the\nSubmission Infrastructure group is in the process of phasing out the GSI part\nof its authentication layers, in favor of IDTokens and Scitokens. 
The use of\ntokens is already well integrated into the HTCondor Software Suite, which has\nallowed us to fully migrate the authentication between internal components of\nSI. Additionally, recent versions of the HTCondor-CE support tokens as well,\nenabling CMS resource requests to Grid sites employing this CE technology to be\ngranted by means of token exchange. After a rollout campaign to sites,\nsuccessfully completed by the third quarter of 2022, the totality of HTCondor\nCEs in use by CMS are already receiving Scitoken-based pilot jobs. On the ARC\nCE side, a parallel campaign was launched to foster the adoption of the REST\ninterface at CMS sites (required to enable token-based job submission via\nHTCondor-G), which is nearing completion as well. In this contribution, the\nnewly adopted authentication model will be described. We will then report on\nthe migration status and final steps towards complete GSI phase out in the CMS\nSI.\n","authors":["Antonio Perez-Calero Yzquierdo","Marco Mascheroni","Edita Kizinevic","Farrukh Aftab Khan","Hyunwoo Kim","Maria Acosta Flechas","Nikos Tsipinakis","Saqib Haleem","Frank Wurthwein"],"pdf_url":"https://arxiv.org/pdf/2405.14644v1.pdf","comment":"26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR\n PHYSICS - 2023"},{"id":"http://arxiv.org/abs/2405.14642v1","updated":"2024-05-23T14:44:49Z","published":"2024-05-23T14:44:49Z","title":"GPU Implementations for Midsize Integer Addition and Multiplication","summary":" This paper explores practical aspects of using a high-level functional\nlanguage for GPU-based arithmetic on ``midsize'' integers. By this we mean\nintegers of up to about a quarter million bits, which is sufficient for most\npractical purposes. The goal is to understand whether it is possible to support\nefficient nested-parallel programs with a small, flexible code base. 
We report\non GPU implementations for addition and multiplication of integers that fit in\none CUDA block, thus leveraging temporal reuse from scratchpad memories. Our\nkey contribution resides in the simplicity of the proposed solutions: We\nrecognize that addition is a straightforward application of scan, which is\nknown to allow efficient GPU implementation. For quadratic multiplication we\nemploy a simple work-partitioning strategy that offers good temporal locality.\nFor FFT multiplication, we efficiently map the computation in the domain of\nintegral fields by finding ``good'' primes that enable almost-full utilization\nof machine words. In comparison, related work uses complex tiling strategies --\nwhich feel too big a hammer for the job -- or uses the computational domain of\nreals, which may degrade the magnitude of the base in which the computation is\ncarried. We evaluate the performance in comparison to the state-of-the-art CGBN\nlibrary, authored by NvidiaLab, and report that our CUDA prototype outperforms\nCGBN for integer sizes higher than 32K bits, while offering comparable\nperformance for smaller sizes. Moreover, we are, to our knowledge, the first to\nreport that FFT multiplication outperforms the classical one on the larger\nsizes that still fit in a CUDA block. Finally, we examine Futhark's strengths\nand weaknesses for efficiently supporting such computations and find out that a\ncompiler pass aimed at efficient sequentialization of excess parallelism would\nsignificantly improve performance.\n","authors":["Cosmin E. Oancea","Stephen M. 
Watt"],"pdf_url":"https://arxiv.org/pdf/2405.14642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14639v1","updated":"2024-05-23T14:42:37Z","published":"2024-05-23T14:42:37Z","title":"Repurposing of the Run 2 CMS High Level Trigger Infrastructure as a\n Cloud Resource for Offline Computing","summary":" The former CMS Run 2 High Level Trigger (HLT) farm is one of the largest\ncontributors to CMS compute resources, providing about 25k job slots for\noffline computing. This CPU farm was initially employed as an opportunistic\nresource, exploited during inter-fill periods, in the LHC Run 2. Since then, it\nhas become a nearly transparent extension of the CMS capacity at CERN, being\nlocated on-site at the LHC interaction point 5 (P5), where the CMS detector is\ninstalled. This resource has been configured to support the execution of\ncritical CMS tasks, such as prompt detector data reconstruction. It can\ntherefore be used in combination with the dedicated Tier 0 capacity at CERN, in\norder to process and absorb peaks in the stream of data coming from the CMS\ndetector. The initial configuration for this resource, based on statically\nconfigured VMs, provided the required level of functionality. However, regular\noperations of this cluster revealed certain limitations compared to the\nresource provisioning and use model employed in the case of WLCG sites. A new\nconfiguration, based on a vacuum-like model, has been implemented for this\nresource in order to solve the detected shortcomings. 
This paper reports about\nthis redeployment work on the permanent cloud for an enhanced support to CMS\noffline computing, comparing the former and new models' respective\nfunctionalities, along with the commissioning effort for the new setup.\n","authors":["Marco Mascheroni","Antonio Perez-Calero Yzquierdo","Edita Kizinevic","Farrukh Aftab Khan","Hyunwoo Kim","Maria Acosta Flechas","Nikos Tsipinakis","Saqib Haleem","Damiele Spiga","Christoph Wissing","Frank Wurthwein"],"pdf_url":"https://arxiv.org/pdf/2405.14639v1.pdf","comment":"26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR\n PHYSICS - 2023"},{"id":"http://arxiv.org/abs/2405.14636v1","updated":"2024-05-23T14:41:22Z","published":"2024-05-23T14:41:22Z","title":"PerLLM: Personalized Inference Scheduling with Edge-Cloud Collaboration\n for Diverse LLM Services","summary":" With the rapid growth in the number of large language model (LLM) users, it\nis difficult for bandwidth-constrained cloud servers to simultaneously process\nmassive LLM services in real-time. Recently, edge-cloud infrastructures have\nbeen used to improve the processing efficiency of large-scale LLM services.\nHowever, the diversity of task requirements and the dynamics of resources pose\ngreat challenges to inference scheduling, leading to the wastage of many\nresources. In this paper, we present PerLLM, a personalized inference\nscheduling framework with edge-cloud collaboration designed for diverse LLM\nservices. For the complexity of multiple constraints and the decision-making\nprocess of edge-cloud collaboration, we integrate the upper confidence bound\nalgorithm based on the constraint satisfaction mechanism in PerLLM. For diverse\nLLM services, PerLLM can optimize service scheduling and resource allocation\nsolutions within the edge-cloud infrastructure to meet processing time\nrequirements while minimizing energy costs. 
Experimental results from different\nmodel deployments show that PerLLM can effectively meet the processing time\nrequirements of personalized services. Compared to other methods, PerLLM\nachieves 2.2x, 2.1x, and 1.6x throughput and reduces the energy cost by more\nthan 50%.\n","authors":["Zheming Yang","Yuanhao Yang","Chang Zhao","Qi Guo","Wenkai He","Wen Ji"],"pdf_url":"https://arxiv.org/pdf/2405.14636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14631v1","updated":"2024-05-23T14:36:59Z","published":"2024-05-23T14:36:59Z","title":"HPC resources for CMS offline computing: An integration and scalability\n challenge for the Submission Infrastructure","summary":" The computing resource needs of LHC experiments are expected to continue\ngrowing significantly during the Run 3 and into the HL-LHC era. The landscape\nof available resources will also evolve, as High Performance Computing (HPC)\nand Cloud resources will provide a comparable, or even dominant, fraction of\nthe total compute capacity. The future years present a challenge for the\nexperiments' resource provisioning models, both in terms of scalability and\nincreasing complexity. The CMS Submission Infrastructure (SI) provisions\ncomputing resources for CMS workflows. This infrastructure is built on a set of\nfederated HTCondor pools, currently aggregating 400k CPU cores distributed\nworldwide and supporting the simultaneous execution of over 200k computing\ntasks. Incorporating HPC resources into CMS computing represents firstly an\nintegration challenge, as HPC centers are much more diverse compared to Grid\nsites. Secondly, evolving the present SI, dimensioned to harness the current\nCMS computing capacity, to reach the resource scales required for the HLLHC\nphase, while maintaining global flexibility and efficiency, will represent an\nadditional challenge for the SI. 
To preventively address future potential\nscalability limits, the SI team regularly runs tests to explore the maximum\nreach of our infrastructure. In this note, the integration of HPC resources\ninto CMS offline computing is summarized, the potential concerns for the SI\nderived from the increased scale of operations are described, and the most\nrecent results of scalability test on the CMS SI are reported.\n","authors":["Antonio Perez-Calero Yzquierdo","Marco Mascheroni","Edita Kizinevic","Farrukh Aftab Khan","Hyunwoo Kim","Maria Acosta Flechas","Nikos Tsipinakis","Saqib Haleem"],"pdf_url":"https://arxiv.org/pdf/2405.14631v1.pdf","comment":"26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR\n PHYSICS - 2023"},{"id":"http://arxiv.org/abs/2402.18386v2","updated":"2024-05-23T14:27:53Z","published":"2024-02-28T15:06:29Z","title":"TrustRate: A Decentralized Platform for Hijack-Resistant Anonymous\n Reviews","summary":" Reviews and ratings by users form a central component in several widely used\nproducts today (e.g., product reviews, ratings of online content, etc.), but\ntoday's platforms for managing such reviews are ad-hoc and vulnerable to\nvarious forms of tampering and hijack by fake reviews either by bots or\nmotivated paid workers. We define a new metric called 'hijack-resistance' for\nsuch review platforms, and then present TrustRate, an end-to-end decentralized,\nhijack-resistant platform for authentic, anonymous, tamper-proof reviews. 
With\na prototype implementation and evaluation at the scale of thousands of nodes,\nwe demonstrate the efficacy and performance of our platform, towards a new\nparadigm for building products based on trusted reviews by end users without\nhaving to trust a single organization that manages the reviews.\n","authors":["Rohit Dwivedula","Sriram Sridhar","Sambhav Satija","Muthian Sivathanu","Nishanth Chandran","Divya Gupta","Satya Lokam"],"pdf_url":"https://arxiv.org/pdf/2402.18386v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2405.14502v1","updated":"2024-05-23T12:35:08Z","published":"2024-05-23T12:35:08Z","title":"DEX: Scalable Range Indexing on Disaggregated Memory [Extended Version]","summary":" Memory disaggregation can potentially allow memory-optimized range indexes\nsuch as B+-trees to scale beyond one machine while attaining high hardware\nutilization and low cost. Designing scalable indexes on disaggregated memory,\nhowever, is challenging due to rudimentary caching, unprincipled offloading and\nexcessive inconsistency among servers.\n This paper proposes DEX, a new scalable B+-tree for memory disaggregation.\nDEX includes a set of techniques to reduce remote accesses, including logical\npartitioning, lightweight caching and cost-aware offloading. 
Our evaluation\nshows that DEX can outperform the state-of-the-art by 1.7--56.3X, and the\nadvantage remains under various setups, such as cache size and skewness.\n","authors":["Baotong Lu","Kaisong Huang","Chieh-Jan Mike Liang","Tianzheng Wang","Eric Lo"],"pdf_url":"https://arxiv.org/pdf/2405.14502v1.pdf","comment":"16 pages; To appear at VLDB 2024"},{"id":"http://arxiv.org/abs/2405.14446v1","updated":"2024-05-23T11:25:19Z","published":"2024-05-23T11:25:19Z","title":"Worldwide Federated Training of Language Models","summary":" The reliance of language model training on massive amounts of computation and\nvast datasets scraped from potentially low-quality, copyrighted, or sensitive\ndata has come into question practically, legally, and ethically. Federated\nlearning provides a plausible alternative by enabling previously untapped data\nto be voluntarily gathered from collaborating organizations. However, when\nscaled globally, federated learning requires collaboration across heterogeneous\nlegal, security, and privacy regimes while accounting for the inherent locality\nof language data; this further exacerbates the established challenge of\nfederated statistical heterogeneity. We propose a Worldwide Federated Language\nModel Training~(WorldLM) system based on federations of federations, where each\nfederation has the autonomy to account for factors such as its industry,\noperating jurisdiction, or competitive environment. WorldLM enables such\nautonomy in the presence of statistical heterogeneity via partial model\nlocalization by allowing sub-federations to attentively aggregate key layers\nfrom their constituents. Furthermore, it can adaptively share information\nacross federations via residual layer embeddings. 
Evaluations of language\nmodeling on naturally heterogeneous datasets show that WorldLM outperforms\nstandard federations by up to $1.91\\times$, approaches the personalized\nperformance of fully local models, and maintains these advantages under\nprivacy-enhancing techniques.\n","authors":["Alex Iacob","Lorenzo Sani","Bill Marino","Preslav Aleksandrov","Nicholas Donald Lane"],"pdf_url":"https://arxiv.org/pdf/2405.14446v1.pdf","comment":"19 pages, 8 figures, Under Review"},{"id":"http://arxiv.org/abs/2405.14413v1","updated":"2024-05-23T10:34:21Z","published":"2024-05-23T10:34:21Z","title":"GeoFaaS: An Edge-to-Cloud FaaS Platform","summary":" The massive growth of mobile and IoT devices demands geographically\ndistributed computing systems for optimal performance, privacy, and\nscalability. However, existing edge-to-cloud serverless platforms lack location\nawareness, resulting in inefficient network usage and increased latency.\n In this paper, we propose GeoFaaS, a novel edge-to-cloud\nFunction-as-a-Service (FaaS) platform that leverages real-time client location\ninformation for transparent request execution on the nearest available FaaS\nnode. If needed, GeoFaaS transparently offloads requests to the cloud when edge\nresources are overloaded, thus, ensuring consistent execution without user\nintervention. GeoFaaS has a modular and decentralized architecture: building on\nthe single-node FaaS system tinyFaaS, GeoFaaS works as a stand-alone\nedge-to-cloud FaaS platform but can also integrate and act as a routing layer\nfor existing FaaS services, e.g., in the cloud. 
To evaluate our approach, we\nimplemented an open-source proof-of-concept prototype and studied performance\nand fault-tolerance behavior in experiments.\n","authors":["Mohammadreza Malekabbasi","Tobias Pfandzelter","Trever Schirmer","David Bermbach"],"pdf_url":"https://arxiv.org/pdf/2405.14413v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.14371v1","updated":"2024-05-23T09:46:22Z","published":"2024-05-23T09:46:22Z","title":"EdgeShard: Efficient LLM Inference via Collaborative Edge Computing","summary":" Large language models (LLMs) have shown great potential in natural language\nprocessing and content generation. However, current LLMs heavily rely on cloud\ncomputing, leading to prolonged latency, high bandwidth cost, and privacy\nconcerns. Edge computing is promising to address such concerns by deploying\nLLMs on edge devices, closer to data sources. Some works try to leverage model\nquantization to reduce the model size to fit the resource-constraint edge\ndevices, but they lead to accuracy loss. Other works use cloud-edge\ncollaboration, suffering from unstable network connections. In this work, we\nleverage collaborative edge computing to facilitate the collaboration among\nedge devices and cloud servers for jointly performing efficient LLM inference.\nWe propose a general framework to partition the LLM model into shards and\ndeploy on distributed devices. To achieve efficient LLM inference, we formulate\nan adaptive joint device selection and model partition problem and design an\nefficient dynamic programming algorithm to optimize the inference latency and\nthroughput, respectively. 
Experiments of Llama2 serial models on a\nheterogeneous physical prototype demonstrate that EdgeShard achieves up to 50%\nlatency reduction and 2x throughput improvement over baseline methods.\n","authors":["Mingjin Zhang","Jiannong Cao","Xiaoming Shen","Zeyang Cui"],"pdf_url":"https://arxiv.org/pdf/2405.14371v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.14291v1","updated":"2024-05-23T08:09:21Z","published":"2024-05-23T08:09:21Z","title":"Variational Bayes for Federated Continual Learning","summary":" Federated continual learning (FCL) has received increasing attention due to\nits potential in handling real-world streaming data, characterized by evolving\ndata distributions and varying client classes over time. The constraints of\nstorage limitations and privacy concerns confine local models to exclusively\naccess the present data within each learning cycle. Consequently, this\nrestriction induces performance degradation in model training on previous data,\ntermed \"catastrophic forgetting\". However, existing FCL approaches need to\nidentify or know changes in data distribution, which is difficult in the real\nworld. To release these limitations, this paper directs attention to a broader\ncontinuous framework. Within this framework, we introduce Federated Bayesian\nNeural Network (FedBNN), a versatile and efficacious framework employing a\nvariational Bayesian neural network across all clients. Our method continually\nintegrates knowledge from local and historical data distributions into a single\nmodel, adeptly learning from new data distributions while retaining performance\non historical distributions. We rigorously evaluate FedBNN's performance\nagainst prevalent methods in federated learning and continual learning using\nvarious metrics. 
Experimental analyses across diverse datasets demonstrate that\nFedBNN achieves state-of-the-art results in mitigating forgetting.\n","authors":["Dezhong Yao","Sanmu Li","Yutong Dai","Zhiqiang Xu","Shengshan Hu","Peilin Zhao","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2405.14291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01728v2","updated":"2024-05-23T08:01:25Z","published":"2024-01-03T13:07:07Z","title":"Ravnest: Decentralized Asynchronous Training on Heterogeneous Devices","summary":" Modern deep learning models, growing larger and more complex, have\ndemonstrated exceptional generalization and accuracy due to training on huge\ndatasets. This trend is expected to continue. However, the increasing size of\nthese models poses challenges in training, as traditional centralized methods\nare limited by memory constraints at such scales. This paper proposes an\nasynchronous decentralized training paradigm for large modern deep learning\nmodels that harnesses the compute power of regular heterogeneous PCs with\nlimited resources connected across the internet to achieve favourable\nperformance metrics. Ravnest facilitates decentralized training by efficiently\norganizing compute nodes into clusters with similar data transfer rates and\ncompute capabilities, without necessitating that each node hosts the entire\nmodel. These clusters engage in $\\textit{Zero-Bubble Asynchronous Model\nParallel}$ training, and a $\\textit{Parallel Multi-Ring All-Reduce}$ method is\nemployed to effectively execute global parameter averaging across all clusters.\nWe have framed our asynchronous SGD loss function as a block structured\noptimization problem with delayed updates and derived an optimal convergence\nrate of $O\\left(\\frac{1}{\\sqrt{K}}\\right)$. 
We further discuss linear speedup\nwith respect to the number of participating clusters and the bound on the\nstaleness parameter.\n","authors":["Anirudh Rajiv Menon","Unnikrishnan Menon","Kailash Ahirwar"],"pdf_url":"https://arxiv.org/pdf/2401.01728v2.pdf","comment":"29 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03791v2","updated":"2024-05-23T07:11:37Z","published":"2024-02-06T08:14:56Z","title":"ZeroPP: Unleashing Exceptional Parallelism Efficiency through\n Tensor-Parallelism-Free Methodology","summary":" Large-scale models rely heavily on 3D parallelism for distributed training,\nwhich utilizes tensor parallelism (TP) as the intra-operator parallelism to\npartition model states across GPUs. However, TP introduces significant\ncommunication overheads and complexity in modifying single-GPU code. In this\npaper, we propose a TP-free distributed framework ZeroPP, which leverages the\nhybrid of scalable inter-operator pipeline parallelism and intra-operator fully\nsharded data parallelism to train models at scale, reducing memory consumption\nand enabling high training efficiency. Through extensive experimentation, we\ndemonstrate that ZeroPP achieves significant performance gains of up to 33%\ncompared to conventional 3D parallelism while maintaining comparable GPU memory\nconsumption.\n","authors":["Ding Tang","Lijuan Jiang","Jiecheng Zhou","Minxi Jin","Hengjie Li","Xingcheng Zhang","Zhilin Pei","Jidong Zhai"],"pdf_url":"https://arxiv.org/pdf/2402.03791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14105v1","updated":"2024-05-23T02:14:17Z","published":"2024-05-23T02:14:17Z","title":"Distributed Speculative Inference of Large Language Models","summary":" Accelerating the inference of large language models (LLMs) is an important\nchallenge in artificial intelligence. 
This paper introduces distributed\nspeculative inference (DSI), a novel distributed inference algorithm that is\nprovably faster than speculative inference (SI) [leviathan2023fast,\nchen2023accelerating, miao2023specinfer] and traditional autoregressive\ninference (non-SI). Like other SI algorithms, DSI works on frozen LLMs,\nrequiring no training or architectural modifications, and it preserves the\ntarget distribution.\n Prior studies on SI have demonstrated empirical speedups (compared to non-SI)\nbut require a fast and accurate drafter LLM. In practice, off-the-shelf LLMs\noften do not have matching drafters that are sufficiently fast and accurate. We\nshow a gap: SI gets slower than non-SI when using slower or less accurate\ndrafters. We close this gap by proving that DSI is faster than both SI and\nnon-SI given any drafters. By orchestrating multiple instances of the target\nand drafters, DSI is not only faster than SI but also supports LLMs that cannot\nbe accelerated with SI.\n Our simulations show speedups of off-the-shelf LLMs in realistic settings:\nDSI is 1.29-1.92x faster than SI.\n","authors":["Nadav Timor","Jonathan Mamou","Daniel Korat","Moshe Berchansky","Oren Pereg","Moshe Wasserblat","Tomer Galanti","Michal Gordon","David Harel"],"pdf_url":"https://arxiv.org/pdf/2405.14105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.02410v2","updated":"2024-05-23T14:23:46Z","published":"2022-06-06T08:03:53Z","title":"Load Balancing Using Sparse Communication","summary":" Load balancing across parallel servers is an important class of congestion\ncontrol problems that arises in service systems. An effective load balancer\nrelies heavily on accurate, real-time congestion information to make routing\ndecisions. 
However, obtaining such information can impose significant\ncommunication overheads, especially in demanding applications like those found\nin modern data centers.\n We introduce a framework for communication-aware load balancing and design\nnew load balancing algorithms that perform exceptionally well even in scenarios\nwith sparse communication patterns. Central to our approach is state\napproximation, where the load balancer first estimates server states through a\ncommunication protocol. Subsequently, it utilizes these approximate states\nwithin a load balancing algorithm to determine routing decisions.\n We demonstrate that by using a novel communication protocol, one can achieve\naccurate queue length approximation with sparse communication: for a maximal\napproximation error of x, the communication frequency only needs to be\nO(1/x^2). We further show, via a diffusion analysis, that a constant maximal\napproximation error is sufficient for achieving asymptotically optimal\nperformance. Taken together, these results therefore demonstrate that highly\nperformant load balancing is possible with very little communication. Through\nsimulations, we observe that the proposed designs match or surpass the\nperformance of state-of-the-art load balancing algorithms while drastically\nreducing communication rates by up to 90%.\n","authors":["Gal Mendelson","Xu Kuang"],"pdf_url":"https://arxiv.org/pdf/2206.02410v2.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2402.08957v3","updated":"2024-05-23T03:13:23Z","published":"2024-02-14T05:57:58Z","title":"MUSTARD: Mastering Uniform Synthesis of Theorem and Proof Data","summary":" Recent large language models (LLMs) have witnessed significant advancement in\nvarious tasks, including mathematical reasoning and theorem proving. 
As these\ntwo tasks require strict and formal multi-step inference, they are appealing\ndomains for exploring the reasoning ability of LLMs but still face important\nchallenges. Previous studies such as Chain-of-Thought (CoT) have revealed the\neffectiveness of intermediate steps guidance. However, such step-wise\nannotation requires heavy labor, leading to insufficient training steps for\ncurrent benchmarks. To fill this gap, this work introduces MUSTARD, a data\ngeneration framework that masters uniform synthesis of theorem and proof data\nof high quality and diversity. MUSTARD synthesizes data in three stages: (1) It\nsamples a few mathematical concept seeds as the problem category. (2) Then, it\nprompts a generative language model with the sampled concepts to obtain both\nthe problems and their step-wise formal solutions. (3) Lastly, the framework\nutilizes a proof assistant (e.g., Lean Prover) to filter the valid proofs. With\nthe proposed MUSTARD, we present a theorem-and-proof benchmark MUSTARDSAUCE\nwith 5,866 valid data points. Each data point contains an informal statement,\nan informal proof, and a translated formal proof that passes the prover\nvalidation. We perform extensive analysis and demonstrate that MUSTARD\ngenerates validated high-quality step-by-step data. We further apply the\nMUSTARDSAUCE for fine-tuning smaller language models. The fine-tuned Llama 2-7B\nachieves a 15.41% average relative performance gain in automated theorem\nproving, and 8.18% in math word problems. 
Codes and data are available at\nhttps://github.com/Eleanor-H/MUSTARD.\n","authors":["Yinya Huang","Xiaohan Lin","Zhengying Liu","Qingxing Cao","Huajian Xin","Haiming Wang","Zhenguo Li","Linqi Song","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2402.08957v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14275v1","updated":"2024-05-23T07:52:54Z","published":"2024-05-23T07:52:54Z","title":"A Language-Theoretic Approach to the Heapability of Signed Permutations","summary":" We investigate a signed version of the Hammersley process, a discrete process\non words related to a property of integer sequences called heapability (Byers\net al., ANALCO 2011). The specific version that we investigate corresponds to a\nversion of this property for signed sequences.\n We give a characterization of the words that can appear as images the signed\nHammersley process. In particular we show that the language of such words is\nthe intersection of two deterministic one-counter languages.\n","authors":["Gabriel Istrate"],"pdf_url":"https://arxiv.org/pdf/2405.14275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14272v1","updated":"2024-05-23T07:50:46Z","published":"2024-05-23T07:50:46Z","title":"Nominal Tree Automata With Name Allocation","summary":" Data trees serve as an abstraction of structured data, such as XML documents.\nA number of specification formalisms for languages of data trees have been\ndeveloped, many of them adhering to the paradigm of register automata, which is\nbased on storing data values encountered on the tree in registers for\nsubsequent comparison with further data values. Already on word languages, the\nexpressiveness of such automata models typically increases with the power of\ncontrol (e.g. deterministic, non-deterministic, alternating). Language\ninclusion is typically undecidable for non-deterministic or alternating models\nunless the number of registers is radically restricted, and even then often\nremains non-elementary. 
We present an automaton model for data trees that\nretains a reasonable level of expressiveness, in particular allows\nnon-determinism and any number of registers, while admitting language inclusion\nchecking in elementary complexity, in fact in parametrized exponential time. We\nphrase the description of our automaton model in the language of nominal sets,\nbuilding on the recently introduced paradigm of explicit name allocation in\nnominal automata.\n","authors":["Simon Prucker","Lutz Schröder"],"pdf_url":"https://arxiv.org/pdf/2405.14272v1.pdf","comment":null}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2309.01945v5","updated":"2024-05-23T13:43:30Z","published":"2023-09-05T04:39:34Z","title":"On-Chip Hardware-Aware Quantization for Mixed Precision Neural Networks","summary":" Low-bit quantization emerges as one of the most promising compression\napproaches for deploying deep neural networks on edge devices. Mixed-precision\nquantization leverages a mixture of bit-widths to unleash the accuracy and\nefficiency potential of quantized models. However, existing mixed-precision\nquantization methods rely on simulations in high-performance devices to achieve\naccuracy and efficiency trade-offs in immense search spaces. This leads to a\nnon-negligible gap between the estimated efficiency metrics and the actual\nhardware that makes quantized models far away from the optimal accuracy and\nefficiency, and also causes the quantization process to rely on additional\nhigh-performance devices. In this paper, we propose an On-Chip Hardware-Aware\nQuantization (OHQ) framework, performing hardware-aware mixed-precision\nquantization on deployed edge devices to achieve accurate and efficient\ncomputing. Specifically, for efficiency metrics, we built an On-Chip\nQuantization Aware pipeline, which allows the quantization process to perceive\nthe actual hardware efficiency of the quantization operator and avoid\noptimization errors caused by inaccurate simulation. 
For accuracy metrics, we\npropose Mask-Guided Quantization Estimation technology to effectively estimate\nthe accuracy impact of operators in the on-chip scenario, getting rid of the\ndependence of the quantization process on high computing power. By synthesizing\ninsights from quantized models and hardware through linear optimization, we can\nobtain optimized bit-width configurations to achieve outstanding performance on\naccuracy and efficiency. We evaluate inference accuracy and acceleration with\nquantization for various architectures and compression ratios on hardware. OHQ\nachieves 70% and 73% accuracy for ResNet-18 and MobileNetV3, respectively, and\ncan reduce latency by 15~30% compared to INT8 on real deployment.\n","authors":["Wei Huang","Haotong Qin","Yangdong Liu","Jingzhuo Liang","Yulun Zhang","Ying Li","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.01945v5.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.13563v2","updated":"2024-05-23T11:34:45Z","published":"2024-03-20T12:56:40Z","title":"DL2Fence: Integrating Deep Learning and Frame Fusion for Enhanced\n Detection and Localization of Refined Denial-of-Service in Large-Scale NoCs","summary":" This study introduces a refined Flooding Injection Rate-adjustable\nDenial-of-Service (DoS) model for Network-on-Chips (NoCs) and more importantly\npresents DL2Fence, a novel framework utilizing Deep Learning (DL) and Frame\nFusion (2F) for DoS detection and localization. Two Convolutional Neural\nNetworks models for classification and segmentation were developed to detect\nand localize DoS respectively. It achieves detection and localization\naccuracies of 95.8% and 91.7%, and precision rates of 98.5% and 99.3% in a\n16x16 mesh NoC. The framework's hardware overhead notably decreases by 76.3%\nwhen scaling from 8x8 to 16x16 NoCs, and it requires 42.4% less hardware\ncompared to state-of-the-arts. 
This advancement demonstrates DL2Fence's\neffectiveness in balancing outstanding detection performance in large-scale\nNoCs with extremely low hardware overhead.\n","authors":["Haoyu Wang","Basel Halak","Jianjie Ren","Ahmad Atamli"],"pdf_url":"https://arxiv.org/pdf/2403.13563v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14209v1","updated":"2024-05-23T06:06:32Z","published":"2024-05-23T06:06:32Z","title":"Exploring and Evaluating Real-world CXL: Use Cases and System Adoption","summary":" Compute eXpress Link (CXL) is emerging as a promising memory interface\ntechnology. Because of the common unavailiability of CXL devices, the\nperformance of the CXL memory is largely unknown. What are the use cases for\nthe CXL memory? What are the impacts of the CXL memory on application\nperformance? How to use the CXL memory in combination with existing memory\ncomponents? In this work, we study the performance of three genuine CXL\nmemory-expansion cards from different vendors. We characterize the basic\nperformance of the CXL memory, study how HPC applications and large language\nmodels can benefit from the CXL memory, and study the interplay between memory\ntiering and page interleaving. We also propose a novel data object-level\ninterleaving policy to match the interleaving policy with memory access\npatterns. We reveal the challenges and opportunities of using the CXL memory.\n","authors":["Jie Liu","Xi Wang","Jianbo Wu","Shuangyan Yang","Jie Ren","Bhanu Shankar","Dong Li"],"pdf_url":"https://arxiv.org/pdf/2405.14209v1.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2405.14642v1","updated":"2024-05-23T14:44:49Z","published":"2024-05-23T14:44:49Z","title":"GPU Implementations for Midsize Integer Addition and Multiplication","summary":" This paper explores practical aspects of using a high-level functional\nlanguage for GPU-based arithmetic on ``midsize'' integers. 
By this we mean\nintegers of up to about a quarter million bits, which is sufficient for most\npractical purposes. The goal is to understand whether it is possible to support\nefficient nested-parallel programs with a small, flexible code base. We report\non GPU implementations for addition and multiplication of integers that fit in\none CUDA block, thus leveraging temporal reuse from scratchpad memories. Our\nkey contribution resides in the simplicity of the proposed solutions: We\nrecognize that addition is a straightforward application of scan, which is\nknown to allow efficient GPU implementation. For quadratic multiplication we\nemploy a simple work-partitioning strategy that offers good temporal locality.\nFor FFT multiplication, we efficiently map the computation in the domain of\nintegral fields by finding ``good'' primes that enable almost-full utilization\nof machine words. In comparison, related work uses complex tiling strategies --\nwhich feel too big a hammer for the job -- or uses the computational domain of\nreals, which may degrade the magnitude of the base in which the computation is\ncarried. We evaluate the performance in comparison to the state-of-the-art CGBN\nlibrary, authored by NvidiaLab, and report that our CUDA prototype outperforms\nCGBN for integer sizes higher than 32K bits, while offering comparable\nperformance for smaller sizes. Moreover, we are, to our knowledge, the first to\nreport that FFT multiplication outperforms the classical one on the larger\nsizes that still fit in a CUDA block. Finally, we examine Futhark's strengths\nand weaknesses for efficiently supporting such computations and find out that a\ncompiler pass aimed at efficient sequentialization of excess parallelism would\nsignificantly improve performance.\n","authors":["Cosmin E. Oancea","Stephen M. 
Watt"],"pdf_url":"https://arxiv.org/pdf/2405.14642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08957v3","updated":"2024-05-23T03:13:23Z","published":"2024-02-14T05:57:58Z","title":"MUSTARD: Mastering Uniform Synthesis of Theorem and Proof Data","summary":" Recent large language models (LLMs) have witnessed significant advancement in\nvarious tasks, including mathematical reasoning and theorem proving. As these\ntwo tasks require strict and formal multi-step inference, they are appealing\ndomains for exploring the reasoning ability of LLMs but still face important\nchallenges. Previous studies such as Chain-of-Thought (CoT) have revealed the\neffectiveness of intermediate steps guidance. However, such step-wise\nannotation requires heavy labor, leading to insufficient training steps for\ncurrent benchmarks. To fill this gap, this work introduces MUSTARD, a data\ngeneration framework that masters uniform synthesis of theorem and proof data\nof high quality and diversity. MUSTARD synthesizes data in three stages: (1) It\nsamples a few mathematical concept seeds as the problem category. (2) Then, it\nprompts a generative language model with the sampled concepts to obtain both\nthe problems and their step-wise formal solutions. (3) Lastly, the framework\nutilizes a proof assistant (e.g., Lean Prover) to filter the valid proofs. With\nthe proposed MUSTARD, we present a theorem-and-proof benchmark MUSTARDSAUCE\nwith 5,866 valid data points. Each data point contains an informal statement,\nan informal proof, and a translated formal proof that passes the prover\nvalidation. We perform extensive analysis and demonstrate that MUSTARD\ngenerates validated high-quality step-by-step data. We further apply the\nMUSTARDSAUCE for fine-tuning smaller language models. The fine-tuned Llama 2-7B\nachieves a 15.41% average relative performance gain in automated theorem\nproving, and 8.18% in math word problems. 
Codes and data are available at\nhttps://github.com/Eleanor-H/MUSTARD.\n","authors":["Yinya Huang","Xiaohan Lin","Zhengying Liu","Qingxing Cao","Huajian Xin","Haiming Wang","Zhenguo Li","Linqi Song","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2402.08957v3.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2405.14430v1","updated":"2024-05-23T11:00:07Z","published":"2024-05-23T11:00:07Z","title":"PipeFusion: Displaced Patch Pipeline Parallelism for Inference of\n Diffusion Transformer Models","summary":" This paper introduces PipeFusion, a novel approach that harnesses multi-GPU\nparallelism to address the high computational and latency challenges of\ngenerating high-resolution images with diffusion transformers (DiT) models.\nPipeFusion splits images into patches and distributes the network layers across\nmultiple devices. It employs a pipeline parallel manner to orchestrate\ncommunication and computations. By leveraging the high similarity between the\ninput from adjacent diffusion steps, PipeFusion eliminates the waiting time in\nthe pipeline by reusing the one-step stale feature maps to provide context for\nthe current step. Our experiments demonstrate that it can generate higher image\nresolution where existing DiT parallel approaches meet OOM. PipeFusion\nsignificantly reduces the required communication bandwidth, enabling DiT\ninference to be hosted on GPUs connected via PCIe rather than the more costly\nNVLink infrastructure, which substantially lowers the overall operational\nexpenses for serving DiT models. 
Our code is publicly available at\nhttps://github.com/PipeFusion/PipeFusion.\n","authors":["Jiannan Wang","Jiarui Fang","Aoyu Li","PengCheng Yang"],"pdf_url":"https://arxiv.org/pdf/2405.14430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14209v1","updated":"2024-05-23T06:06:32Z","published":"2024-05-23T06:06:32Z","title":"Exploring and Evaluating Real-world CXL: Use Cases and System Adoption","summary":" Compute eXpress Link (CXL) is emerging as a promising memory interface\ntechnology. Because of the common unavailiability of CXL devices, the\nperformance of the CXL memory is largely unknown. What are the use cases for\nthe CXL memory? What are the impacts of the CXL memory on application\nperformance? How to use the CXL memory in combination with existing memory\ncomponents? In this work, we study the performance of three genuine CXL\nmemory-expansion cards from different vendors. We characterize the basic\nperformance of the CXL memory, study how HPC applications and large language\nmodels can benefit from the CXL memory, and study the interplay between memory\ntiering and page interleaving. We also propose a novel data object-level\ninterleaving policy to match the interleaving policy with memory access\npatterns. 
We reveal the challenges and opportunities of using the CXL memory.\n","authors":["Jie Liu","Xi Wang","Jianbo Wu","Shuangyan Yang","Jie Ren","Bhanu Shankar","Dong Li"],"pdf_url":"https://arxiv.org/pdf/2405.14209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14185v1","updated":"2024-05-23T05:29:29Z","published":"2024-05-23T05:29:29Z","title":"A structure-aware framework for learning device placements on\n computation graphs","summary":" Existing approaches for device placement ignore the topological features of\ncomputation graphs and rely mostly on heuristic methods for graph partitioning.\nAt the same time, they either follow a grouper-placer or an encoder-placer\narchitecture, which requires understanding the interaction structure between\ncode operations. To bridge the gap between encoder-placer and grouper-placer\ntechniques, we propose a novel framework for the task of device placement,\nrelying on smaller computation graphs extracted from the OpenVINO toolkit using\nreinforcement learning. The framework consists of five steps, including graph\ncoarsening, node representation learning and policy optimization. It\nfacilitates end-to-end training and takes into consideration the directed and\nacyclic nature of the computation graphs. We also propose a model variant,\ninspired by graph parsing networks and complex network analysis, enabling graph\nrepresentation learning and personalized graph partitioning jointly, using an\nunspecified number of groups. To train the entire framework, we utilize\nreinforcement learning techniques by employing the execution time of the\nsuggested device placements to formulate the reward. We demonstrate the\nflexibility and effectiveness of our approach through multiple experiments with\nthree benchmark models, namely Inception-V3, ResNet, and BERT. The robustness\nof the proposed framework is also highlighted through an ablation study. 
The\nsuggested placements improve the inference speed for the benchmark models by up\nto $58.2\\%$ over CPU execution and by up to $60.24\\%$ compared to other\ncommonly used baselines.\n","authors":["Shukai Duan","Heng Ping","Nikos Kanakaris","Xiongye Xiao","Peiyu Zhang","Panagiotis Kyriakis","Nesreen K. Ahmed","Guixiang Ma","Mihai Capota","Shahin Nazarian","Theodore L. Willke","Paul Bogdan"],"pdf_url":"https://arxiv.org/pdf/2405.14185v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2405.14835v1","updated":"2024-05-23T17:50:34Z","published":"2024-05-23T17:50:34Z","title":"Polynomial Pass Semi-Streaming Lower Bounds for K-Cores and Degeneracy","summary":" The following question arises naturally in the study of graph streaming\nalgorithms:\n \"Is there any graph problem which is \"not too hard\", in that it can be solved\nefficiently with total communication (nearly) linear in the number $n$ of\nvertices, and for which, nonetheless, any streaming algorithm with\n$\\tilde{O}(n)$ space (i.e., a semi-streaming algorithm) needs a polynomial\n$n^{\\Omega(1)}$ number of passes?\"\n Assadi, Chen, and Khanna [STOC 2019] were the first to prove that this is\nindeed the case. However, the lower bounds that they obtained are for rather\nnon-standard graph problems.\n Our first main contribution is to present the first polynomial-pass lower\nbounds for natural \"not too hard\" graph problems studied previously in the\nstreaming model: $k$-cores and degeneracy. We devise a novel communication\nprotocol for both problems with near-linear communication, thus showing that\n$k$-cores and degeneracy are natural examples of \"not too hard\" problems.\nIndeed, previous work have developed single-pass semi-streaming algorithms for\napproximating these problems. 
In contrast, we prove that any semi-streaming\nalgorithm for exactly solving these problems requires (almost)\n$\\Omega(n^{1/3})$ passes.\n Our second main contribution is improved round-communication lower bounds for\nthe underlying communication problems at the basis of these reductions:\n * We improve the previous lower bound of Assadi, Chen, and Khanna for hidden\npointer chasing (HPC) to achieve optimal bounds.\n * We observe that all current reductions from HPC can also work with a\ngeneralized version of this problem that we call MultiHPC, and prove an even\nstronger and optimal lower bound for this generalization.\n These two results collectively allow us to improve the resulting pass lower\nbounds for semi-streaming algorithms by a polynomial factor, namely, from\n$n^{1/5}$ to $n^{1/3}$ passes.\n","authors":["Sepehr Assadi","Prantar Ghosh","Bruno Loff","Parth Mittal","Sagnik Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2405.14835v1.pdf","comment":"Accepted at CCC 2024"},{"id":"http://arxiv.org/abs/2402.12875v3","updated":"2024-05-23T17:10:39Z","published":"2024-02-20T10:11:03Z","title":"Chain of Thought Empowers Transformers to Solve Inherently Serial\n Problems","summary":" Instructing the model to generate a sequence of intermediate steps, a.k.a., a\nchain of thought (CoT), is a highly effective method to improve the accuracy of\nlarge language models (LLMs) on arithmetics and symbolic reasoning tasks.\nHowever, the mechanism behind CoT remains unclear. This work provides a\ntheoretical understanding of the power of CoT for decoder-only transformers\nthrough the lens of expressiveness. Conceptually, CoT empowers the model with\nthe ability to perform inherently serial computation, which is otherwise\nlacking in transformers, especially when depth is low. Given input length $n$,\nprevious works have shown that constant-depth transformers with finite\nprecision $\\mathsf{poly}(n)$ embedding size can only solve problems in\n$\\mathsf{TC}^0$ without CoT. 
We first show an even tighter expressiveness upper\nbound for constant-depth transformers with constant-bit precision, which can\nonly solve problems in $\\mathsf{AC}^0$, a proper subset of $ \\mathsf{TC}^0$.\nHowever, with $T$ steps of CoT, constant-depth transformers using constant-bit\nprecision and $O(\\log n)$ embedding size can solve any problem solvable by\nboolean circuits of size $T$. Empirically, enabling CoT dramatically improves\nthe accuracy for tasks that are hard for parallel computation, including the\ncomposition of permutation groups, iterated squaring, and circuit value\nproblems, especially for low-depth transformers.\n","authors":["Zhiyuan Li","Hong Liu","Denny Zhou","Tengyu Ma"],"pdf_url":"https://arxiv.org/pdf/2402.12875v3.pdf","comment":"38 pages, 10 figures. Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.06740v2","updated":"2024-05-23T14:53:58Z","published":"2024-02-09T19:16:56Z","title":"Nearest Neighbor Complexity and Boolean Circuits","summary":" A nearest neighbor representation of a Boolean function $f$ is a set of\nvectors (anchors) labeled by $0$ or $1$ such that $f(\\vec{x}) = 1$ if and only\nif the closest anchor to $\\vec{x}$ is labeled by $1$. This model was introduced\nby Hajnal, Liu, and Tur\\'an (2022), who studied bounds on the number of anchors\nrequired to represent Boolean functions under different choices of anchors\n(real vs. Boolean vectors) as well as the more expressive model of $k$-nearest\nneighbors.\n We initiate the study of the representational power of nearest and\n$k$-nearest neighbors through Boolean circuit complexity. To this end, we\nestablish a connection between Boolean functions with polynomial nearest\nneighbor complexity and those that can be efficiently represented by classes\nbased on linear inequalities -- min-plus polynomial threshold functions --\npreviously studied in relation to threshold circuits. This extends an\nobservation of Hajnal et al. (2022). 
We obtain exponential lower bounds on the\n$k$-nearest neighbors complexity of explicit $n$-variate functions, assuming $k\n\\leq n^{1-\\epsilon}$. Previously, no superlinear lower bound was known for any\n$k>1$.\n Next, we further extend the connection between nearest neighbor\nrepresentations and circuits to the $k$-nearest neighbors case. As a result, we\nshow that proving superpolynomial lower bounds for the $k$-nearest neighbors\ncomplexity of an explicit function for arbitrary $k$ would require a\nbreakthrough in circuit complexity. In addition, we prove an exponential\nseparation between the nearest neighbor and $k$-nearest neighbors complexity\n(for unrestricted $k$) of an explicit function. These results address questions\nraised by Hajnal et al. (2022) of proving strong lower bounds for $k$-nearest\nneighbors and understanding the role of the parameter $k$. Finally, we devise\nnew bounds on the nearest neighbor complexity for several explicit functions.\n","authors":["Mason DiCicco","Vladimir Podolskii","Daniel Reichman"],"pdf_url":"https://arxiv.org/pdf/2402.06740v2.pdf","comment":"Minor corrections"},{"id":"http://arxiv.org/abs/2311.07454v4","updated":"2024-05-23T11:55:55Z","published":"2023-11-13T16:35:34Z","title":"Causal Discovery under Latent Class Confounding","summary":" An acyclic causal structure can be described using a directed acyclic graph\n(DAG) with arrows indicating causation. The task of learning this structure\nfrom data is known as \"causal discovery.\" Diverse populations or changing\nenvironments can sometimes give rise to heterogeneous data. This heterogeneity\ncan be thought of as a mixture model with multiple \"sources,\" each exerting\ntheir own distinct signature on the observed variables. From this perspective,\nthe source is a latent common cause for every observed variable. 
While some\nmethods for causal discovery are able to work around unobserved confounding in\nspecial cases, the only known ways to deal with a global confounder (such as a\nlatent class) involve parametric assumptions. Focusing on discrete observables,\nwe demonstrate that globally confounded causal structures can still be\nidentifiable without parametric assumptions, so long as the number of latent\nclasses remains small relative to the size and sparsity of the underlying DAG.\n","authors":["Bijan Mazaheri","Spencer Gordon","Yuval Rabani","Leonard Schulman"],"pdf_url":"https://arxiv.org/pdf/2311.07454v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05079v2","updated":"2024-05-23T10:51:36Z","published":"2023-08-09T17:16:19Z","title":"Space-bounded quantum state testing via space-efficient quantum singular\n value transformation","summary":" Driven by exploring the power of quantum computation with a limited number of\nqubits, we present a novel complete characterization for space-bounded quantum\ncomputation, which encompasses settings with one-sided error (unitary coRQL)\nand two-sided error (BQL), approached from a quantum state testing perspective:\n - The first family of natural complete problems for unitary coRQL, i.e.,\nspace-bounded quantum state certification for trace distance and\nHilbert-Schmidt distance;\n - A new family of natural complete problems for BQL, i.e., space-bounded\nquantum state testing for trace distance, Hilbert-Schmidt distance, and quantum\nentropy difference.\n In the space-bounded quantum state testing problem, we consider two\nlogarithmic-qubit quantum circuits (devices) denoted as $Q_0$ and $Q_1$, which\nprepare quantum states $\\rho_0$ and $\\rho_1$, respectively, with access to\ntheir ``source code''. Our goal is to decide whether $\\rho_0$ is\n$\\epsilon_1$-close to or $\\epsilon_2$-far from $\\rho_1$ with respect to a\nspecified distance-like measure. 
Interestingly, unlike time-bounded state\ntesting problems, our results reveal that the space-bounded state testing\nproblems all correspond to the same class. Moreover, our algorithms on the\ntrace distance inspire an algorithmic Holevo-Helstrom measurement, implying\nQSZK is in QIP(2) with a quantum linear-space honest prover.\n Our results primarily build upon a space-efficient variant of the quantum\nsingular value transformation (QSVT) introduced by Gily\\'en, Su, Low, and Wiebe\n(STOC 2019), which is of independent interest. Our technique provides a unified\napproach for designing space-bounded quantum algorithms. Specifically, we show\nthat implementing QSVT for any bounded polynomial that approximates a\npiecewise-smooth function incurs only a constant overhead in terms of the space\nrequired for special forms of the projected unitary encoding.\n","authors":["François Le Gall","Yupan Liu","Qisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05079v2.pdf","comment":"71 pages, 3 figures. v2: improved error and norm bounds in\n space-efficient polynomial approximation (Section 3.1), clarified the\n application scope of the robust oblivious amplitude amplification in Theorem\n 3.10, and added new results on algorithmic Holevo-Helstrom measurement and a\n slightly improved upper bound for QSZK (Section 5)"},{"id":"http://arxiv.org/abs/2403.09134v3","updated":"2024-05-23T09:21:44Z","published":"2024-03-14T07:05:10Z","title":"Local Enumeration and Majority Lower Bounds","summary":" Depth-3 circuit lower bounds and $k$-SAT algorithms are intimately related;\nthe state-of-the-art $\\Sigma^k_3$-circuit lower bound and the $k$-SAT algorithm\nare based on the same combinatorial theorem. In this paper we define a problem\nwhich reveals new interactions between the two. 
Define Enum($k$, $t$) problem\nas: given an $n$-variable $k$-CNF and an initial assignment $\\alpha$, output\nall satisfying assignments at Hamming distance $t$ from $\\alpha$, assuming that\nthere are no satisfying assignments of Hamming distance less than $t$ from\n$\\alpha$. Observe that: an upper bound $b(n, k, t)$ on the complexity of\nEnum($k$, $t$) implies:\n - Depth-3 circuits: Any $\\Sigma^k_3$ circuit computing the Majority function\nhas size at least $\\binom{n}{\\frac{n}{2}}/b(n, k, \\frac{n}{2})$.\n - $k$-SAT: There exists an algorithm solving $k$-SAT in time $O(\\sum_{t =\n1}^{n/2}b(n, k, t))$.\n A simple construction shows that $b(n, k, \\frac{n}{2}) \\ge 2^{(1 -\nO(\\log(k)/k))n}$. Thus, matching upper bounds would imply a\n$\\Sigma^k_3$-circuit lower bound of $2^{\\Omega(\\log(k)n/k)}$ and a $k$-SAT\nupper bound of $2^{(1 - \\Omega(\\log(k)/k))n}$. The former yields an\nunrestricted depth-3 lower bound of $2^{\\omega(\\sqrt{n})}$ solving a long\nstanding open problem, and the latter breaks the Super Strong Exponential Time\nHypothesis.\n In this paper, we propose a randomized algorithm for Enum($k$, $t$) and\nintroduce new ideas to analyze it. We demonstrate the power of our ideas by\nconsidering the first non-trivial instance of the problem, i.e., Enum($3$,\n$\\frac{n}{2}$). We show that the expected running time of our algorithm is\n$1.598^n$, substantially improving on the trivial bound of $3^{n/2} \\simeq\n1.732^n$. This already improves $\\Sigma^3_3$ lower bounds for Majority function\nto $1.251^n$. The previous bound was $1.154^n$ which follows from the work of\nH{\\aa}stad, Jukna, and Pudl\\'ak (Comput. 
Complex.'95).\n","authors":["Mohit Gurumukhani","Ramamohan Paturi","Pavel Pudlák","Michael Saks","Navid Talebanfard"],"pdf_url":"https://arxiv.org/pdf/2403.09134v3.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2405.14678v1","updated":"2024-05-23T15:13:33Z","published":"2024-05-23T15:13:33Z","title":"Measuring data types","summary":" In this article, we combine Sweedler's classic theory of measuring coalgebras\n-- by which $k$-algebras are enriched in $k$-coalgebras for $k$ a field -- with\nthe theory of W-types -- by which the categorical semantics of inductive data\ntypes in functional programming languages are understood. In our main theorem,\nwe find that under some hypotheses, algebras of an endofunctor are enriched in\ncoalgebras of the same endofunctor, and we find polynomial endofunctors provide\nmany interesting examples of this phenomenon. We then generalize the notion of\ninitial algebra of an endofunctor using this enrichment, thus generalizing the\nnotion of W-type. This article is an extended version of arXiv:2303.16793, it\nadds expository introductions to the original theories of measuring coalgebras\nand W-types along with some improvements to the main theory and many explicitly\nworked examples.\n","authors":["Lukas Mulder","Paige Randall North","Maximilien Péroux"],"pdf_url":"https://arxiv.org/pdf/2405.14678v1.pdf","comment":"67 pages"},{"id":"http://arxiv.org/abs/2403.03880v2","updated":"2024-05-23T15:03:45Z","published":"2024-03-06T17:40:26Z","title":"Almost Surely Asymptotically Constant Graph Neural Networks","summary":" We present a new angle on the expressive power of graph neural networks\n(GNNs) by studying how the predictions of a GNN probabilistic classifier evolve\nas we apply it on larger graphs drawn from some random graph model. We show\nthat the output converges to a constant function, which upper-bounds what these\nclassifiers can uniformly express. 
This strong convergence phenomenon applies\nto a very wide class of GNNs, including state of the art models, with\naggregates including mean and the attention-based mechanism of graph\ntransformers. Our results apply to a broad class of random graph models,\nincluding sparse and dense variants of the Erd\\H{o}s-R\\'enyi model, the\nstochastic block model, and the Barab\\'asi-Albert model. We empirically\nvalidate these findings, observing that the convergence phenomenon appears not\nonly on random graphs but also on some real-world graphs.\n","authors":["Sam Adam-Day","Michael Benedikt","İsmail İlkan Ceylan","Ben Finkelshtein"],"pdf_url":"https://arxiv.org/pdf/2403.03880v2.pdf","comment":"9 body pages, 28 appendix pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.05774v2","updated":"2024-05-23T15:02:53Z","published":"2024-05-09T13:49:45Z","title":"Monoidal bicategories, differential linear logic, and analytic functors","summary":" We develop further the theory of monoidal bicategories by introducing and\nstudying bicategorical counterparts of the notions of a linear explonential\ncomonad, as considered in the study of linear logic, and of a codereliction\ntransformation, introduced to study differential linear logic via differential\ncategories. As an application, we extend the differential calculus of Joyal's\nanalytic functors to analytic functors between presheaf categories, just as\nordinary calculus extends from a single variable to many variables.\n","authors":["M. Fiore","N. Gambino","M. Hyland"],"pdf_url":"https://arxiv.org/pdf/2405.05774v2.pdf","comment":"v2: fixed typos, added references. 46 pages. 
Comments welcome"},{"id":"http://arxiv.org/abs/2405.14606v1","updated":"2024-05-23T14:19:21Z","published":"2024-05-23T14:19:21Z","title":"Logical Characterizations of Recurrent Graph Neural Networks with Reals\n and Floats","summary":" In pioneering work from 2019, Barcel\\'o and coauthors identified logics that\nprecisely match the expressive power of constant iteration-depth graph neural\nnetworks (GNNs) relative to properties definable in first-order logic. In this\narticle, we give exact logical characterizations of recurrent GNNs in two\nscenarios: (1) in the setting with floating-point numbers and (2) with reals.\nFor floats, the formalism matching recurrent GNNs is a rule-based modal logic\nwith counting, while for reals we use a suitable infinitary modal logic, also\nwith counting. These results give exact matches between logics and GNNs in the\nrecurrent setting without relativising to a background logic in either case,\nbut using some natural assumptions about floating-point arithmetic. Applying\nour characterizations, we also prove that, relative to graph properties\ndefinable in monadic second-order logic (MSO), our infinitary and rule-based\nlogics are equally expressive. This implies that recurrent GNNs with reals and\nfloats have the same expressive power over MSO-definable properties and shows\nthat, for such properties, also recurrent GNNs with reals are characterized by\na (finitary!) rule-based modal logic. In the general case, in contrast, the\nexpressive power with floats is weaker than with reals. 
In addition to\nlogic-oriented results, we also characterize recurrent GNNs, with both reals\nand floats, via distributed automata, drawing links to distributed computing\nmodels.\n","authors":["Veeti Ahvonen","Damian Heiman","Antti Kuusisto","Carsten Lutz"],"pdf_url":"https://arxiv.org/pdf/2405.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14481v1","updated":"2024-05-23T12:13:55Z","published":"2024-05-23T12:13:55Z","title":"A logic of judgmental existence and its relation to proof irrelevance","summary":" We introduce a simple natural deduction system for reasoning with judgments\nof the form \"there exists a proof of $\\varphi$\" to explore the notion of\njudgmental existence following Martin-L\\\"{o}f's methodology of distinguishing\nbetween judgments and propositions. In this system, the existential judgment\ncan be internalized into a modal notion of propositional existence that is\nclosely related to truncation modality, a key tool for obtaining proof\nirrelevance, and lax modality. We provide a computational interpretation in the\nstyle of the Curry-Howard isomorphism for the existence modality and show that\nthe corresponding system has some desirable properties such as strong\nnormalization or subject reduction.\n","authors":["Ivo Pezlar"],"pdf_url":"https://arxiv.org/pdf/2405.14481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02997v2","updated":"2024-05-23T11:20:43Z","published":"2024-02-05T13:35:03Z","title":"A Profunctorial Semantics for Quantum Supermaps","summary":" We identify morphisms of strong profunctors as a categorification of quantum\nsupermaps. These black-box generalisations of diagrams-with-holes are hence\nplaced within the broader field of profunctor optics, as morphisms in the\ncategory of copresheaves on concrete networks. This enables the first\nconstruction of abstract logical connectives such as tensor products and\nnegations for supermaps in a totally theory-independent setting. 
These logical\nconnectives are found to be all that is needed to abstractly model the key\nstructural features of the quantum theory of supermaps: black-box indefinite\ncausal order, black-box definite causal order, and the factorisation of\ndefinitely causally ordered supermaps into concrete circuit diagrams. We\ndemonstrate that at the heart of these factorisation theorems lies the Yoneda\nlemma and the notion of representability.\n","authors":["James Hefford","Matt Wilson"],"pdf_url":"https://arxiv.org/pdf/2402.02997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14400v1","updated":"2024-05-23T10:19:35Z","published":"2024-05-23T10:19:35Z","title":"Verifying Global Two-Safety Properties in Neural Networks with\n Confidence","summary":" We present the first automated verification technique for confidence-based\n2-safety properties, such as global robustness and global fairness, in deep\nneural networks (DNNs). Our approach combines self-composition to leverage\nexisting reachability analysis techniques and a novel abstraction of the\nsoftmax function, which is amenable to automated verification. We characterize\nand prove the soundness of our static analysis technique. 
Furthermore, we\nimplement it on top of Marabou, a safety analysis tool for neural networks,\nconducting a performance evaluation on several publicly available benchmarks\nfor DNN verification.\n","authors":["Anagha Athavale","Ezio Bartocci","Maria Christakis","Matteo Maffei","Dejan Nickovic","Georg Weissenbacher"],"pdf_url":"https://arxiv.org/pdf/2405.14400v1.pdf","comment":"Accepted at the 36th International Conference on Computer Aided\n Verification, 2024"},{"id":"http://arxiv.org/abs/2309.14011v2","updated":"2024-05-23T09:36:49Z","published":"2023-09-25T10:25:43Z","title":"A Truly Concurrent Semantics for Reversible CCS","summary":" Reversible CCS (RCCS) is a well-established, formal model for reversible\ncommunicating systems, which has been built on top of the classical Calculus of\nCommunicating Systems (CCS). In its original formulation, each CCS process is\nequipped with a memory that records its performed actions, which is then used\nto reverse computations. More recently, abstract models for RCCS have been\nproposed in the literature, basically, by directly associating RCCS processes\nwith (reversible versions of) event structures. In this paper we propose a\ndifferent abstract model: starting from one of the well-known encoding of CCS\ninto Petri nets we apply a recently proposed approach to incorporate\ncausally-consistent reversibility to Petri nets, obtaining as result the\n(reversible) net counterpart of every RCCS term.\n","authors":["Hernán Melgratti","Claudio Antares Mezzina","G. Michele Pinna"],"pdf_url":"https://arxiv.org/pdf/2309.14011v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14233v1","updated":"2024-05-23T07:08:57Z","published":"2024-05-23T07:08:57Z","title":"Language processing in humans and computers","summary":" Machine-learned language models have transformed everyday life: they steer us\nwhen we study, drive, manage money. They have the potential to transform our\ncivilization. But they hallucinate. 
Their realities are virtual. This note\nprovides a high-level overview of language models and outlines a low-level\nmodel of learning machines. It turns out that, after they become capable of\nrecognizing hallucinations and dreaming safely, as humans tend to be, the\nlanguage-learning machines proceed to generate broader systems of false beliefs\nand self-confirming theories, as humans tend to do.\n","authors":["Dusko Pavlovic"],"pdf_url":"https://arxiv.org/pdf/2405.14233v1.pdf","comment":"100 pages, 64 figures; lecture notes, book draft"},{"id":"http://arxiv.org/abs/2305.08419v3","updated":"2024-05-23T14:31:19Z","published":"2023-05-15T08:00:55Z","title":"Tractable and Intractable Entailment Problems in Separation Logic with\n Inductively Defined Predicates","summary":" We establish various complexity results for the entailment problem between\nformulas in Separation Logic with user-defined predicates denoting recursive\ndata structures. The considered fragments are characterized by syntactic\nconditions on the inductive rules that define the semantics of the predicates.\nWe focus on so-called P-rules, which are similar to (but simpler than) the PCE\nrules introduced by Iosif et al. in 2013. In particular, for a specific\nfragment where predicates are defined by so-called loc-deterministic inductive\nrules, we devise a sound and complete cyclic proof procedure running in\npolynomial time. Several complexity lower bounds are provided, showing that any\nrelaxing of the provided conditions makes the problem intractable.\n","authors":["Mnacho Echenim","Nicolas Peltier"],"pdf_url":"https://arxiv.org/pdf/2305.08419v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2008.08533v4","updated":"2024-05-23T10:05:36Z","published":"2020-08-19T16:24:55Z","title":"Transpension: The Right Adjoint to the Pi-type","summary":" Presheaf models of dependent type theory have been successfully applied to\nmodel HoTT, parametricity, and directed, guarded and nominal type theory. 
There\nhas been considerable interest in internalizing aspects of these presheaf\nmodels, either to make the resulting language more expressive, or in order to\ncarry out further reasoning internally, allowing greater abstraction and\nsometimes automated verification. While the constructions of presheaf models\nlargely follow a common pattern, approaches towards internalization do not.\nThroughout the literature, various internal presheaf operators ($\\surd$,\n$\\Phi/\\mathsf{extent}$, $\\Psi/\\mathsf{Gel}$, $\\mathsf{Glue}$, $\\mathsf{Weld}$,\n$\\mathsf{mill}$, the strictness axiom and locally fresh names) can be found and\nlittle is known about their relative expressivenes. Moreover, some of these\nrequire that variables whose type is a shape (representable presheaf, e.g. an\ninterval) be used affinely.\n We propose a novel type former, the transpension type, which is right adjoint\nto universal quantification over a shape. Its structure resembles a dependent\nversion of the suspension type in HoTT. We give general typing rules and a\npresheaf semantics in terms of base category functors dubbed multipliers.\nStructural rules for shape variables and certain aspects of the transpension\ntype depend on characteristics of the multiplier. We demonstrate how the\ntranspension type and the strictness axiom can be combined to implement all and\nimprove some of the aforementioned internalization operators (without formal\nclaim in the case of locally fresh names).\n","authors":["Andreas Nuyts","Dominique Devriese"],"pdf_url":"https://arxiv.org/pdf/2008.08533v4.pdf","comment":"54 pages, 12 figures. 
Removed tick and lockless notation, changed\n most terminology (dictionary available), other tweaks"},{"id":"http://arxiv.org/abs/2008.08530v4","updated":"2024-05-23T09:49:29Z","published":"2020-08-19T16:13:01Z","title":"The Transpension Type: Technical Report","summary":" The purpose of these notes is to give a categorical semantics for the\ntranspension type (Nuyts and Devriese, Transpension: The Right Adjoint to the\nPi-type, Accepted at LMCS, 2024), which is right adjoint to a potentially\nsubstructural dependent function type. In section 2 we discuss some\nprerequisites. In section 3, we define multipliers and discuss their\nproperties. In section 4, we study how multipliers lift from base categories to\npresheaf categories. In section 5, we explain how typical presheaf modalities\ncan be used in the presence of the transpension type. In section 6, we study\ncommutation properties of prior modalities, substitution modalities and\nmultiplier modalities.\n","authors":["Andreas Nuyts"],"pdf_url":"https://arxiv.org/pdf/2008.08530v4.pdf","comment":"47 pages, 1 figure. 
Changes: Mainly change in terminology"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE 
z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 19 + +
+
+
+ + ☆ Recurrent Early Exits for Federated Learning with Heterogeneous Clients + + +
+ Federated learning (FL) has enabled distributed learning of a model across +multiple clients in a privacy-preserving manner. One of the main challenges of +FL is to accommodate clients with varying hardware capacities; clients have +differing compute and memory requirements. To tackle this challenge, recent +state-of-the-art approaches leverage the use of early exits. Nonetheless, these +approaches fall short of mitigating the challenges of joint learning multiple +exit classifiers, often relying on hand-picked heuristic solutions for +knowledge distillation among classifiers and/or utilizing additional layers for +weaker classifiers. In this work, instead of utilizing multiple classifiers, we +propose a recurrent early exit approach named ReeFL that fuses features from +different sub-models into a single shared classifier. Specifically, we use a +transformer-based early-exit module shared among sub-models to i) better +exploit multi-layer feature representations for task-specific prediction and +ii) modulate the feature representation of the backbone model for subsequent +predictions. We additionally present a per-client self-distillation approach +where the best sub-model is automatically selected as the teacher of the other +sub-models at each client. Our experiments on standard image and speech +classification benchmarks across various emerging federated fine-tuning +baselines demonstrate ReeFL's effectiveness over previous works. + +
+
+ comment: Accepted at the 41st International Conference on Machine Learning + (ICML 2024) +
+
+
+
+
+ + ☆ The integration of heterogeneous resources in the CMS Submission + Infrastructure for the LHC Run 3 and beyond + + +
+ While the computing landscape supporting LHC experiments is currently +dominated by x86 processors at WLCG sites, this configuration will evolve in +the coming years. LHC collaborations will be increasingly employing HPC and +Cloud facilities to process the vast amounts of data expected during the LHC +Run 3 and the future HL-LHC phase. These facilities often feature diverse +compute resources, including alternative CPU architectures like ARM and IBM +Power, as well as a variety of GPU specifications. Using these heterogeneous +resources efficiently is thus essential for the LHC collaborations reaching +their future scientific goals. The Submission Infrastructure (SI) is a central +element in CMS Computing, enabling resource acquisition and exploitation by CMS +data processing, simulation and analysis tasks. The SI must therefore be +adapted to ensure access and optimal utilization of this heterogeneous compute +capacity. Some steps in this evolution have been already taken, as CMS is +currently using opportunistically a small pool of GPU slots provided mainly at +the CMS WLCG sites. Additionally, Power9 processors have been validated for CMS +production at the Marconi-100 cluster at CINECA. This note will describe the +updated capabilities of the SI to continue ensuring the efficient allocation +and use of computing resources by CMS, despite their increasing diversity. The +next steps towards a full integration and support of heterogeneous resources +according to CMS needs will also be reported. + +
+
+ comment: 26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR + PHYSICS - 2023 +
+
+
+
+
+ + ☆ Adoption of a token-based authentication model for the CMS Submission + Infrastructure + + +
+ The CMS Submission Infrastructure (SI) is the main computing resource +provisioning system for CMS workloads. A number of HTCondor pools are employed +to manage this infrastructure, which aggregates geographically distributed +resources from the WLCG and other providers. Historically, the model of +authentication among the diverse components of this infrastructure has relied +on the Grid Security Infrastructure (GSI), based on identities and X509 +certificates. In contrast, commonly used modern authentication standards are +based on capabilities and tokens. The WLCG has identified this trend and aims +at a transparent replacement of GSI for all its workload management, data +transfer and storage access operations, to be completed during the current LHC +Run 3. As part of this effort, and within the context of CMS computing, the +Submission Infrastructure group is in the process of phasing out the GSI part +of its authentication layers, in favor of IDTokens and Scitokens. The use of +tokens is already well integrated into the HTCondor Software Suite, which has +allowed us to fully migrate the authentication between internal components of +SI. Additionally, recent versions of the HTCondor-CE support tokens as well, +enabling CMS resource requests to Grid sites employing this CE technology to be +granted by means of token exchange. After a rollout campaign to sites, +successfully completed by the third quarter of 2022, the totality of HTCondor +CEs in use by CMS are already receiving Scitoken-based pilot jobs. On the ARC +CE side, a parallel campaign was launched to foster the adoption of the REST +interface at CMS sites (required to enable token-based job submission via +HTCondor-G), which is nearing completion as well. In this contribution, the +newly adopted authentication model will be described. We will then report on +the migration status and final steps towards complete GSI phase out in the CMS +SI. + +
+
+ comment: 26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR + PHYSICS - 2023 +
+
+
+
+
+ + ☆ GPU Implementations for Midsize Integer Addition and Multiplication + + +
+ This paper explores practical aspects of using a high-level functional +language for GPU-based arithmetic on ``midsize'' integers. By this we mean +integers of up to about a quarter million bits, which is sufficient for most +practical purposes. The goal is to understand whether it is possible to support +efficient nested-parallel programs with a small, flexible code base. We report +on GPU implementations for addition and multiplication of integers that fit in +one CUDA block, thus leveraging temporal reuse from scratchpad memories. Our +key contribution resides in the simplicity of the proposed solutions: We +recognize that addition is a straightforward application of scan, which is +known to allow efficient GPU implementation. For quadratic multiplication we +employ a simple work-partitioning strategy that offers good temporal locality. +For FFT multiplication, we efficiently map the computation in the domain of +integral fields by finding ``good'' primes that enable almost-full utilization +of machine words. In comparison, related work uses complex tiling strategies -- +which feel too big a hammer for the job -- or uses the computational domain of +reals, which may degrade the magnitude of the base in which the computation is +carried. We evaluate the performance in comparison to the state-of-the-art CGBN +library, authored by NvidiaLab, and report that our CUDA prototype outperforms +CGBN for integer sizes higher than 32K bits, while offering comparable +performance for smaller sizes. Moreover, we are, to our knowledge, the first to +report that FFT multiplication outperforms the classical one on the larger +sizes that still fit in a CUDA block. Finally, we examine Futhark's strengths +and weaknesses for efficiently supporting such computations and find out that a +compiler pass aimed at efficient sequentialization of excess parallelism would +significantly improve performance. + +
+
+
+
+
+ + ☆ Repurposing of the Run 2 CMS High Level Trigger Infrastructure as a + Cloud Resource for Offline Computing + + +
+ The former CMS Run 2 High Level Trigger (HLT) farm is one of the largest +contributors to CMS compute resources, providing about 25k job slots for +offline computing. This CPU farm was initially employed as an opportunistic +resource, exploited during inter-fill periods, in the LHC Run 2. Since then, it +has become a nearly transparent extension of the CMS capacity at CERN, being +located on-site at the LHC interaction point 5 (P5), where the CMS detector is +installed. This resource has been configured to support the execution of +critical CMS tasks, such as prompt detector data reconstruction. It can +therefore be used in combination with the dedicated Tier 0 capacity at CERN, in +order to process and absorb peaks in the stream of data coming from the CMS +detector. The initial configuration for this resource, based on statically +configured VMs, provided the required level of functionality. However, regular +operations of this cluster revealed certain limitations compared to the +resource provisioning and use model employed in the case of WLCG sites. A new +configuration, based on a vacuum-like model, has been implemented for this +resource in order to solve the detected shortcomings. This paper reports about +this redeployment work on the permanent cloud for an enhanced support to CMS +offline computing, comparing the former and new models' respective +functionalities, along with the commissioning effort for the new setup. + +
+
+ comment: 26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR + PHYSICS - 2023 +
+
+
+
+
+ + ☆ PerLLM: Personalized Inference Scheduling with Edge-Cloud Collaboration + for Diverse LLM Services + + +
+ With the rapid growth in the number of large language model (LLM) users, it +is difficult for bandwidth-constrained cloud servers to simultaneously process +massive LLM services in real-time. Recently, edge-cloud infrastructures have +been used to improve the processing efficiency of large-scale LLM services. +However, the diversity of task requirements and the dynamics of resources pose +great challenges to inference scheduling, leading to the wastage of many +resources. In this paper, we present PerLLM, a personalized inference +scheduling framework with edge-cloud collaboration designed for diverse LLM +services. For the complexity of multiple constraints and the decision-making +process of edge-cloud collaboration, we integrate the upper confidence bound +algorithm based on the constraint satisfaction mechanism in PerLLM. For diverse +LLM services, PerLLM can optimize service scheduling and resource allocation +solutions within the edge-cloud infrastructure to meet processing time +requirements while minimizing energy costs. Experimental results from different +model deployments show that PerLLM can effectively meet the processing time +requirements of personalized services. Compared to other methods, PerLLM +achieves 2.2x, 2.1x, and 1.6x throughput and reduces the energy cost by more +than 50%. + +
+
+
+
+
+ + ☆ HPC resources for CMS offline computing: An integration and scalability + challenge for the Submission Infrastructure + + +
+ The computing resource needs of LHC experiments are expected to continue +growing significantly during the Run 3 and into the HL-LHC era. The landscape +of available resources will also evolve, as High Performance Computing (HPC) +and Cloud resources will provide a comparable, or even dominant, fraction of +the total compute capacity. The future years present a challenge for the +experiments' resource provisioning models, both in terms of scalability and +increasing complexity. The CMS Submission Infrastructure (SI) provisions +computing resources for CMS workflows. This infrastructure is built on a set of +federated HTCondor pools, currently aggregating 400k CPU cores distributed +worldwide and supporting the simultaneous execution of over 200k computing +tasks. Incorporating HPC resources into CMS computing represents firstly an +integration challenge, as HPC centers are much more diverse compared to Grid +sites. Secondly, evolving the present SI, dimensioned to harness the current +CMS computing capacity, to reach the resource scales required for the HLLHC +phase, while maintaining global flexibility and efficiency, will represent an +additional challenge for the SI. To preventively address future potential +scalability limits, the SI team regularly runs tests to explore the maximum +reach of our infrastructure. In this note, the integration of HPC resources +into CMS offline computing is summarized, the potential concerns for the SI +derived from the increased scale of operations are described, and the most +recent results of scalability test on the CMS SI are reported. + +
+
+ comment: 26TH INTERNATIONAL CONFERENCE ON COMPUTING IN HIGH ENERGY & NUCLEAR + PHYSICS - 2023 +
+
+
+
+
+ + ☆ DEX: Scalable Range Indexing on Disaggregated Memory [Extended Version] + + +
+ Memory disaggregation can potentially allow memory-optimized range indexes +such as B+-trees to scale beyond one machine while attaining high hardware +utilization and low cost. Designing scalable indexes on disaggregated memory, +however, is challenging due to rudimentary caching, unprincipled offloading and +excessive inconsistency among servers. + This paper proposes DEX, a new scalable B+-tree for memory disaggregation. +DEX includes a set of techniques to reduce remote accesses, including logical +partitioning, lightweight caching and cost-aware offloading. Our evaluation +shows that DEX can outperform the state-of-the-art by 1.7--56.3X, and the +advantage remains under various setups, such as cache size and skewness. + +
+
+ comment: 16 pages; To appear at VLDB 2024 +
+
+
+
+
+ + ☆ Worldwide Federated Training of Language Models + + +
+ The reliance of language model training on massive amounts of computation and +vast datasets scraped from potentially low-quality, copyrighted, or sensitive +data has come into question practically, legally, and ethically. Federated +learning provides a plausible alternative by enabling previously untapped data +to be voluntarily gathered from collaborating organizations. However, when +scaled globally, federated learning requires collaboration across heterogeneous +legal, security, and privacy regimes while accounting for the inherent locality +of language data; this further exacerbates the established challenge of +federated statistical heterogeneity. We propose a Worldwide Federated Language +Model Training~(WorldLM) system based on federations of federations, where each +federation has the autonomy to account for factors such as its industry, +operating jurisdiction, or competitive environment. WorldLM enables such +autonomy in the presence of statistical heterogeneity via partial model +localization by allowing sub-federations to attentively aggregate key layers +from their constituents. Furthermore, it can adaptively share information +across federations via residual layer embeddings. Evaluations of language +modeling on naturally heterogeneous datasets show that WorldLM outperforms +standard federations by up to $1.91\times$, approaches the personalized +performance of fully local models, and maintains these advantages under +privacy-enhancing techniques. + +
+
+ comment: 19 pages, 8 figures, Under Review +
+
+
+
+
+ + ☆ GeoFaaS: An Edge-to-Cloud FaaS Platform + + +
+ The massive growth of mobile and IoT devices demands geographically +distributed computing systems for optimal performance, privacy, and +scalability. However, existing edge-to-cloud serverless platforms lack location +awareness, resulting in inefficient network usage and increased latency. + In this paper, we propose GeoFaaS, a novel edge-to-cloud +Function-as-a-Service (FaaS) platform that leverages real-time client location +information for transparent request execution on the nearest available FaaS +node. If needed, GeoFaaS transparently offloads requests to the cloud when edge +resources are overloaded, thus, ensuring consistent execution without user +intervention. GeoFaaS has a modular and decentralized architecture: building on +the single-node FaaS system tinyFaaS, GeoFaaS works as a stand-alone +edge-to-cloud FaaS platform but can also integrate and act as a routing layer +for existing FaaS services, e.g., in the cloud. To evaluate our approach, we +implemented an open-source proof-of-concept prototype and studied performance +and fault-tolerance behavior in experiments. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ☆ EdgeShard: Efficient LLM Inference via Collaborative Edge Computing + + +
+ Large language models (LLMs) have shown great potential in natural language +processing and content generation. However, current LLMs heavily rely on cloud +computing, leading to prolonged latency, high bandwidth cost, and privacy +concerns. Edge computing is promising to address such concerns by deploying +LLMs on edge devices, closer to data sources. Some works try to leverage model +quantization to reduce the model size to fit the resource-constraint edge +devices, but they lead to accuracy loss. Other works use cloud-edge +collaboration, suffering from unstable network connections. In this work, we +leverage collaborative edge computing to facilitate the collaboration among +edge devices and cloud servers for jointly performing efficient LLM inference. +We propose a general framework to partition the LLM model into shards and +deploy on distributed devices. To achieve efficient LLM inference, we formulate +an adaptive joint device selection and model partition problem and design an +efficient dynamic programming algorithm to optimize the inference latency and +throughput, respectively. Experiments of Llama2 serial models on a +heterogeneous physical prototype demonstrate that EdgeShard achieves up to 50% +latency reduction and 2x throughput improvement over baseline methods. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Variational Bayes for Federated Continual Learning + + +
+ Federated continual learning (FCL) has received increasing attention due to +its potential in handling real-world streaming data, characterized by evolving +data distributions and varying client classes over time. The constraints of +storage limitations and privacy concerns confine local models to exclusively +access the present data within each learning cycle. Consequently, this +restriction induces performance degradation in model training on previous data, +termed "catastrophic forgetting". However, existing FCL approaches need to +identify or know changes in data distribution, which is difficult in the real +world. To release these limitations, this paper directs attention to a broader +continuous framework. Within this framework, we introduce Federated Bayesian +Neural Network (FedBNN), a versatile and efficacious framework employing a +variational Bayesian neural network across all clients. Our method continually +integrates knowledge from local and historical data distributions into a single +model, adeptly learning from new data distributions while retaining performance +on historical distributions. We rigorously evaluate FedBNN's performance +against prevalent methods in federated learning and continual learning using +various metrics. Experimental analyses across diverse datasets demonstrate that +FedBNN achieves state-of-the-art results in mitigating forgetting. + +
+
+
+
+
+ + ☆ Distributed Speculative Inference of Large Language Models + + +
+ Accelerating the inference of large language models (LLMs) is an important +challenge in artificial intelligence. This paper introduces distributed +speculative inference (DSI), a novel distributed inference algorithm that is +provably faster than speculative inference (SI) [leviathan2023fast, +chen2023accelerating, miao2023specinfer] and traditional autoregressive +inference (non-SI). Like other SI algorithms, DSI works on frozen LLMs, +requiring no training or architectural modifications, and it preserves the +target distribution. + Prior studies on SI have demonstrated empirical speedups (compared to non-SI) +but require a fast and accurate drafter LLM. In practice, off-the-shelf LLMs +often do not have matching drafters that are sufficiently fast and accurate. We +show a gap: SI gets slower than non-SI when using slower or less accurate +drafters. We close this gap by proving that DSI is faster than both SI and +non-SI given any drafters. By orchestrating multiple instances of the target +and drafters, DSI is not only faster than SI but also supports LLMs that cannot +be accelerated with SI. + Our simulations show speedups of off-the-shelf LLMs in realistic settings: +DSI is 1.29-1.92x faster than SI. + +
+
+
+
+
+ + ♻ ☆ Ephemeral Rollups are All you Need + + +
+ In the realm of open and composable gaming, we envision platforms where users +actively expand, create, engage, and immerse themselves in a rich world of +entertainment. One promising avenue for achieving this vision is through fully +on-chain (FOC) games, where both game state and logic reside on the blockchain, +maximizing composability. However, we must grapple with inherent limitations +and trade-offs, particularly in terms of costs and scalability. This paper +proposes a framework that leverages the Solana Virtual Machine (SVM) to scale +FOC games without state fragmentation or compromised trust assumptions. The +framework introduces a systematic approach for discovering, utilizing, and +publishing modular pieces of logic as components deeply rooted in the +Entity-Component-System (ECS) pattern. To enhance scalability and resource +optimization, we introduce the concept of Ephemeral Rollups (ERs) that overcome +the tradeoffs of L2s horizontal scaling. These dedicated runtimes can be +customized to provide higher operational speed, configurable ticking +mechanisms, provable sessions and gasless transactions without +composability-scalability tradeoffs. + +
+
+
+
+
+ + ♻ ☆ cuFastTuckerPlus: A Stochastic Parallel Sparse FastTucker Decomposition + Using GPU Tensor Cores + + +
+ Sparse tensors are prevalent in real-world applications, often characterized +by their large-scale, high-order, and high-dimensional nature. Directly +handling raw tensors is impractical due to the significant memory and +computational overhead involved. The current mainstream approach involves +compressing or decomposing the original tensor. One popular tensor +decomposition algorithm is the Tucker decomposition. However, existing +state-of-the-art algorithms for large-scale Tucker decomposition typically +relax the original optimization problem into multiple convex optimization +problems to ensure polynomial convergence. Unfortunately, these algorithms tend +to converge slowly. In contrast, tensor decomposition exhibits a simple +optimization landscape, making local search algorithms capable of converging to +a global (approximate) optimum much faster. In this paper, we propose the +FastTuckerPlus algorithm, which decomposes the original optimization problem +into two non-convex optimization problems and solves them alternately using the +Stochastic Gradient Descent method. Furthermore, we introduce cuFastTuckerPlus, +a fine-grained parallel algorithm designed for GPU platforms, leveraging the +performance of tensor cores. This algorithm minimizes memory access overhead +and computational costs, surpassing the state-of-the-art algorithms. Our +experimental results demonstrate that our method achieves a speedup of $3X$ to +$5X$ compared to state-of-the-art algorithms. + +
+
+
+
+
+ + ♻ ☆ TrustRate: A Decentralized Platform for Hijack-Resistant Anonymous + Reviews + + +
+ Reviews and ratings by users form a central component in several widely used +products today (e.g., product reviews, ratings of online content, etc.), but +today's platforms for managing such reviews are ad-hoc and vulnerable to +various forms of tampering and hijack by fake reviews either by bots or +motivated paid workers. We define a new metric called 'hijack-resistance' for +such review platforms, and then present TrustRate, an end-to-end decentralized, +hijack-resistant platform for authentic, anonymous, tamper-proof reviews. With +a prototype implementation and evaluation at the scale of thousands of nodes, +we demonstrate the efficacy and performance of our platform, towards a new +paradigm for building products based on trusted reviews by end users without +having to trust a single organization that manages the reviews. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Ravnest: Decentralized Asynchronous Training on Heterogeneous Devices + + +
+ Modern deep learning models, growing larger and more complex, have +demonstrated exceptional generalization and accuracy due to training on huge +datasets. This trend is expected to continue. However, the increasing size of +these models poses challenges in training, as traditional centralized methods +are limited by memory constraints at such scales. This paper proposes an +asynchronous decentralized training paradigm for large modern deep learning +models that harnesses the compute power of regular heterogeneous PCs with +limited resources connected across the internet to achieve favourable +performance metrics. Ravnest facilitates decentralized training by efficiently +organizing compute nodes into clusters with similar data transfer rates and +compute capabilities, without necessitating that each node hosts the entire +model. These clusters engage in $\textit{Zero-Bubble Asynchronous Model +Parallel}$ training, and a $\textit{Parallel Multi-Ring All-Reduce}$ method is +employed to effectively execute global parameter averaging across all clusters. +We have framed our asynchronous SGD loss function as a block structured +optimization problem with delayed updates and derived an optimal convergence +rate of $O\left(\frac{1}{\sqrt{K}}\right)$. We further discuss linear speedup +with respect to the number of participating clusters and the bound on the +staleness parameter. + +
+
+ comment: 29 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ ZeroPP: Unleashing Exceptional Parallelism Efficiency through + Tensor-Parallelism-Free Methodology + + +
+ Large-scale models rely heavily on 3D parallelism for distributed training, +which utilizes tensor parallelism (TP) as the intra-operator parallelism to +partition model states across GPUs. However, TP introduces significant +communication overheads and complexity in modifying single-GPU code. In this +paper, we propose a TP-free distributed framework ZeroPP, which leverages the +hybrid of scalable inter-operator pipeline parallelism and intra-operator fully +sharded data parallelism to train models at scale, reducing memory consumption +and enabling high training efficiency. Through extensive experimentation, we +demonstrate that ZeroPP achieves significant performance gains of up to 33% +compared to conventional 3D parallelism while maintaining comparable GPU memory +consumption. + +
+
+
+
+
+ + ♻ ☆ Load Balancing Using Sparse Communication + + +
+ Load balancing across parallel servers is an important class of congestion +control problems that arises in service systems. An effective load balancer +relies heavily on accurate, real-time congestion information to make routing +decisions. However, obtaining such information can impose significant +communication overheads, especially in demanding applications like those found +in modern data centers. + We introduce a framework for communication-aware load balancing and design +new load balancing algorithms that perform exceptionally well even in scenarios +with sparse communication patterns. Central to our approach is state +approximation, where the load balancer first estimates server states through a +communication protocol. Subsequently, it utilizes these approximate states +within a load balancing algorithm to determine routing decisions. + We demonstrate that by using a novel communication protocol, one can achieve +accurate queue length approximation with sparse communication: for a maximal +approximation error of x, the communication frequency only needs to be +O(1/x^2). We further show, via a diffusion analysis, that a constant maximal +approximation error is sufficient for achieving asymptotically optimal +performance. Taken together, these results therefore demonstrate that highly +performant load balancing is possible with very little communication. Through +simulations, we observe that the proposed designs match or surpass the +performance of state-of-the-art load balancing algorithms while drastically +reducing communication rates by up to 90%. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 3 + +
+
+
+ + ☆ A Language-Theoretic Approach to the Heapability of Signed Permutations + + +
 + We investigate a signed version of the Hammersley process, a discrete process
+on words related to a property of integer sequences called heapability (Byers
+et al., ANALCO 2011). The specific version that we investigate corresponds to a
+version of this property for signed sequences.
+ We give a characterization of the words that can appear as images of the signed
+Hammersley process. In particular we show that the language of such words is
+the intersection of two deterministic one-counter languages.
+
+
+
+
+
+ + ☆ Nominal Tree Automata With Name Allocation + + +
+ Data trees serve as an abstraction of structured data, such as XML documents. +A number of specification formalisms for languages of data trees have been +developed, many of them adhering to the paradigm of register automata, which is +based on storing data values encountered on the tree in registers for +subsequent comparison with further data values. Already on word languages, the +expressiveness of such automata models typically increases with the power of +control (e.g. deterministic, non-deterministic, alternating). Language +inclusion is typically undecidable for non-deterministic or alternating models +unless the number of registers is radically restricted, and even then often +remains non-elementary. We present an automaton model for data trees that +retains a reasonable level of expressiveness, in particular allows +non-determinism and any number of registers, while admitting language inclusion +checking in elementary complexity, in fact in parametrized exponential time. We +phrase the description of our automaton model in the language of nominal sets, +building on the recently introduced paradigm of explicit name allocation in +nominal automata. + +
+
+
+
+
+ + ♻ ☆ MUSTARD: Mastering Uniform Synthesis of Theorem and Proof Data + + +
+ Recent large language models (LLMs) have witnessed significant advancement in +various tasks, including mathematical reasoning and theorem proving. As these +two tasks require strict and formal multi-step inference, they are appealing +domains for exploring the reasoning ability of LLMs but still face important +challenges. Previous studies such as Chain-of-Thought (CoT) have revealed the +effectiveness of intermediate steps guidance. However, such step-wise +annotation requires heavy labor, leading to insufficient training steps for +current benchmarks. To fill this gap, this work introduces MUSTARD, a data +generation framework that masters uniform synthesis of theorem and proof data +of high quality and diversity. MUSTARD synthesizes data in three stages: (1) It +samples a few mathematical concept seeds as the problem category. (2) Then, it +prompts a generative language model with the sampled concepts to obtain both +the problems and their step-wise formal solutions. (3) Lastly, the framework +utilizes a proof assistant (e.g., Lean Prover) to filter the valid proofs. With +the proposed MUSTARD, we present a theorem-and-proof benchmark MUSTARDSAUCE +with 5,866 valid data points. Each data point contains an informal statement, +an informal proof, and a translated formal proof that passes the prover +validation. We perform extensive analysis and demonstrate that MUSTARD +generates validated high-quality step-by-step data. We further apply the +MUSTARDSAUCE for fine-tuning smaller language models. The fine-tuned Llama 2-7B +achieves a 15.41% average relative performance gain in automated theorem +proving, and 8.18% in math word problems. Codes and data are available at +https://github.com/Eleanor-H/MUSTARD. + +
+
+
+
+
+
+
+
 + 
 Hardware Architecture 3
 
+
+
+ + ☆ Exploring and Evaluating Real-world CXL: Use Cases and System Adoption + + +
 + Compute eXpress Link (CXL) is emerging as a promising memory interface
+technology. Because of the common unavailability of CXL devices, the
+performance of the CXL memory is largely unknown. What are the use cases for
+the CXL memory? What are the impacts of the CXL memory on application
+performance? How to use the CXL memory in combination with existing memory
+components? In this work, we study the performance of three genuine CXL
+memory-expansion cards from different vendors. We characterize the basic
+performance of the CXL memory, study how HPC applications and large language
+models can benefit from the CXL memory, and study the interplay between memory
+tiering and page interleaving. We also propose a novel data object-level
+interleaving policy to match the interleaving policy with memory access
+patterns. We reveal the challenges and opportunities of using the CXL memory.
+
+
+
+
+
+ + ♻ ☆ On-Chip Hardware-Aware Quantization for Mixed Precision Neural Networks + + +
+ Low-bit quantization emerges as one of the most promising compression +approaches for deploying deep neural networks on edge devices. Mixed-precision +quantization leverages a mixture of bit-widths to unleash the accuracy and +efficiency potential of quantized models. However, existing mixed-precision +quantization methods rely on simulations in high-performance devices to achieve +accuracy and efficiency trade-offs in immense search spaces. This leads to a +non-negligible gap between the estimated efficiency metrics and the actual +hardware that makes quantized models far away from the optimal accuracy and +efficiency, and also causes the quantization process to rely on additional +high-performance devices. In this paper, we propose an On-Chip Hardware-Aware +Quantization (OHQ) framework, performing hardware-aware mixed-precision +quantization on deployed edge devices to achieve accurate and efficient +computing. Specifically, for efficiency metrics, we built an On-Chip +Quantization Aware pipeline, which allows the quantization process to perceive +the actual hardware efficiency of the quantization operator and avoid +optimization errors caused by inaccurate simulation. For accuracy metrics, we +propose Mask-Guided Quantization Estimation technology to effectively estimate +the accuracy impact of operators in the on-chip scenario, getting rid of the +dependence of the quantization process on high computing power. By synthesizing +insights from quantized models and hardware through linear optimization, we can +obtain optimized bit-width configurations to achieve outstanding performance on +accuracy and efficiency. We evaluate inference accuracy and acceleration with +quantization for various architectures and compression ratios on hardware. OHQ +achieves 70% and 73% accuracy for ResNet-18 and MobileNetV3, respectively, and +can reduce latency by 15~30% compared to INT8 on real deployment. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ DL2Fence: Integrating Deep Learning and Frame Fusion for Enhanced + Detection and Localization of Refined Denial-of-Service in Large-Scale NoCs + + +
+ This study introduces a refined Flooding Injection Rate-adjustable +Denial-of-Service (DoS) model for Network-on-Chips (NoCs) and more importantly +presents DL2Fence, a novel framework utilizing Deep Learning (DL) and Frame +Fusion (2F) for DoS detection and localization. Two Convolutional Neural +Networks models for classification and segmentation were developed to detect +and localize DoS respectively. It achieves detection and localization +accuracies of 95.8% and 91.7%, and precision rates of 98.5% and 99.3% in a +16x16 mesh NoC. The framework's hardware overhead notably decreases by 76.3% +when scaling from 8x8 to 16x16 NoCs, and it requires 42.4% less hardware +compared to state-of-the-arts. This advancement demonstrates DL2Fence's +effectiveness in balancing outstanding detection performance in large-scale +NoCs with extremely low hardware overhead. + +
+
+
+
+
+
+
+
+ + Programming and Languages 2 + +
+
+
+ + ☆ GPU Implementations for Midsize Integer Addition and Multiplication + + +
+ This paper explores practical aspects of using a high-level functional +language for GPU-based arithmetic on ``midsize'' integers. By this we mean +integers of up to about a quarter million bits, which is sufficient for most +practical purposes. The goal is to understand whether it is possible to support +efficient nested-parallel programs with a small, flexible code base. We report +on GPU implementations for addition and multiplication of integers that fit in +one CUDA block, thus leveraging temporal reuse from scratchpad memories. Our +key contribution resides in the simplicity of the proposed solutions: We +recognize that addition is a straightforward application of scan, which is +known to allow efficient GPU implementation. For quadratic multiplication we +employ a simple work-partitioning strategy that offers good temporal locality. +For FFT multiplication, we efficiently map the computation in the domain of +integral fields by finding ``good'' primes that enable almost-full utilization +of machine words. In comparison, related work uses complex tiling strategies -- +which feel too big a hammer for the job -- or uses the computational domain of +reals, which may degrade the magnitude of the base in which the computation is +carried. We evaluate the performance in comparison to the state-of-the-art CGBN +library, authored by NvidiaLab, and report that our CUDA prototype outperforms +CGBN for integer sizes higher than 32K bits, while offering comparable +performance for smaller sizes. Moreover, we are, to our knowledge, the first to +report that FFT multiplication outperforms the classical one on the larger +sizes that still fit in a CUDA block. Finally, we examine Futhark's strengths +and weaknesses for efficiently supporting such computations and find out that a +compiler pass aimed at efficient sequentialization of excess parallelism would +significantly improve performance. + +
+
+
+
+
+ + ♻ ☆ MUSTARD: Mastering Uniform Synthesis of Theorem and Proof Data + + +
+ Recent large language models (LLMs) have witnessed significant advancement in +various tasks, including mathematical reasoning and theorem proving. As these +two tasks require strict and formal multi-step inference, they are appealing +domains for exploring the reasoning ability of LLMs but still face important +challenges. Previous studies such as Chain-of-Thought (CoT) have revealed the +effectiveness of intermediate steps guidance. However, such step-wise +annotation requires heavy labor, leading to insufficient training steps for +current benchmarks. To fill this gap, this work introduces MUSTARD, a data +generation framework that masters uniform synthesis of theorem and proof data +of high quality and diversity. MUSTARD synthesizes data in three stages: (1) It +samples a few mathematical concept seeds as the problem category. (2) Then, it +prompts a generative language model with the sampled concepts to obtain both +the problems and their step-wise formal solutions. (3) Lastly, the framework +utilizes a proof assistant (e.g., Lean Prover) to filter the valid proofs. With +the proposed MUSTARD, we present a theorem-and-proof benchmark MUSTARDSAUCE +with 5,866 valid data points. Each data point contains an informal statement, +an informal proof, and a translated formal proof that passes the prover +validation. We perform extensive analysis and demonstrate that MUSTARD +generates validated high-quality step-by-step data. We further apply the +MUSTARDSAUCE for fine-tuning smaller language models. The fine-tuned Llama 2-7B +achieves a 15.41% average relative performance gain in automated theorem +proving, and 8.18% in math word problems. Codes and data are available at +https://github.com/Eleanor-H/MUSTARD. + +
+
+
+
+
+
+
+
+ + Performance Profiling 3 + +
+
+
+ + ☆ PipeFusion: Displaced Patch Pipeline Parallelism for Inference of + Diffusion Transformer Models + + +
+ This paper introduces PipeFusion, a novel approach that harnesses multi-GPU +parallelism to address the high computational and latency challenges of +generating high-resolution images with diffusion transformers (DiT) models. +PipeFusion splits images into patches and distributes the network layers across +multiple devices. It employs a pipeline parallel manner to orchestrate +communication and computations. By leveraging the high similarity between the +input from adjacent diffusion steps, PipeFusion eliminates the waiting time in +the pipeline by reusing the one-step stale feature maps to provide context for +the current step. Our experiments demonstrate that it can generate higher image +resolution where existing DiT parallel approaches meet OOM. PipeFusion +significantly reduces the required communication bandwidth, enabling DiT +inference to be hosted on GPUs connected via PCIe rather than the more costly +NVLink infrastructure, which substantially lowers the overall operational +expenses for serving DiT models. Our code is publicly available at +https://github.com/PipeFusion/PipeFusion. + +
+
+
+
+
+ + ☆ Exploring and Evaluating Real-world CXL: Use Cases and System Adoption + + +
 + Compute eXpress Link (CXL) is emerging as a promising memory interface
+technology. Because of the common unavailability of CXL devices, the
+performance of the CXL memory is largely unknown. What are the use cases for
+the CXL memory? What are the impacts of the CXL memory on application
+performance? How to use the CXL memory in combination with existing memory
+components? In this work, we study the performance of three genuine CXL
+memory-expansion cards from different vendors. We characterize the basic
+performance of the CXL memory, study how HPC applications and large language
+models can benefit from the CXL memory, and study the interplay between memory
+tiering and page interleaving. We also propose a novel data object-level
+interleaving policy to match the interleaving policy with memory access
+patterns. We reveal the challenges and opportunities of using the CXL memory.
+
+
+
+
+
+ + ☆ A structure-aware framework for learning device placements on + computation graphs + + +
+ Existing approaches for device placement ignore the topological features of +computation graphs and rely mostly on heuristic methods for graph partitioning. +At the same time, they either follow a grouper-placer or an encoder-placer +architecture, which requires understanding the interaction structure between +code operations. To bridge the gap between encoder-placer and grouper-placer +techniques, we propose a novel framework for the task of device placement, +relying on smaller computation graphs extracted from the OpenVINO toolkit using +reinforcement learning. The framework consists of five steps, including graph +coarsening, node representation learning and policy optimization. It +facilitates end-to-end training and takes into consideration the directed and +acyclic nature of the computation graphs. We also propose a model variant, +inspired by graph parsing networks and complex network analysis, enabling graph +representation learning and personalized graph partitioning jointly, using an +unspecified number of groups. To train the entire framework, we utilize +reinforcement learning techniques by employing the execution time of the +suggested device placements to formulate the reward. We demonstrate the +flexibility and effectiveness of our approach through multiple experiments with +three benchmark models, namely Inception-V3, ResNet, and BERT. The robustness +of the proposed framework is also highlighted through an ablation study. The +suggested placements improve the inference speed for the benchmark models by up +to $58.2\%$ over CPU execution and by up to $60.24\%$ compared to other +commonly used baselines. + +
+
+
+
+
+
+
+
+ + Computational Complexity 6 + +
+
+
+ + ☆ Polynomial Pass Semi-Streaming Lower Bounds for K-Cores and Degeneracy + + +
 + The following question arises naturally in the study of graph streaming
+algorithms:
+ "Is there any graph problem which is "not too hard", in that it can be solved
+efficiently with total communication (nearly) linear in the number $n$ of
+vertices, and for which, nonetheless, any streaming algorithm with
+$\tilde{O}(n)$ space (i.e., a semi-streaming algorithm) needs a polynomial
+$n^{\Omega(1)}$ number of passes?"
+ Assadi, Chen, and Khanna [STOC 2019] were the first to prove that this is
+indeed the case. However, the lower bounds that they obtained are for rather
+non-standard graph problems.
+ Our first main contribution is to present the first polynomial-pass lower
+bounds for natural "not too hard" graph problems studied previously in the
+streaming model: $k$-cores and degeneracy. We devise a novel communication
+protocol for both problems with near-linear communication, thus showing that
+$k$-cores and degeneracy are natural examples of "not too hard" problems.
+Indeed, previous work has developed single-pass semi-streaming algorithms for
+approximating these problems. In contrast, we prove that any semi-streaming
+algorithm for exactly solving these problems requires (almost)
+$\Omega(n^{1/3})$ passes.
+ Our second main contribution is improved round-communication lower bounds for
+the underlying communication problems at the basis of these reductions:
+ * We improve the previous lower bound of Assadi, Chen, and Khanna for hidden
+pointer chasing (HPC) to achieve optimal bounds.
+ * We observe that all current reductions from HPC can also work with a
+generalized version of this problem that we call MultiHPC, and prove an even
+stronger and optimal lower bound for this generalization.
+ These two results collectively allow us to improve the resulting pass lower
+bounds for semi-streaming algorithms by a polynomial factor, namely, from
+$n^{1/5}$ to $n^{1/3}$ passes.
+
+
+ comment: Accepted at CCC 2024 +
+
+
+
+
+ + ♻ ☆ Chain of Thought Empowers Transformers to Solve Inherently Serial + Problems + + +
+ Instructing the model to generate a sequence of intermediate steps, a.k.a., a +chain of thought (CoT), is a highly effective method to improve the accuracy of +large language models (LLMs) on arithmetics and symbolic reasoning tasks. +However, the mechanism behind CoT remains unclear. This work provides a +theoretical understanding of the power of CoT for decoder-only transformers +through the lens of expressiveness. Conceptually, CoT empowers the model with +the ability to perform inherently serial computation, which is otherwise +lacking in transformers, especially when depth is low. Given input length $n$, +previous works have shown that constant-depth transformers with finite +precision $\mathsf{poly}(n)$ embedding size can only solve problems in +$\mathsf{TC}^0$ without CoT. We first show an even tighter expressiveness upper +bound for constant-depth transformers with constant-bit precision, which can +only solve problems in $\mathsf{AC}^0$, a proper subset of $ \mathsf{TC}^0$. +However, with $T$ steps of CoT, constant-depth transformers using constant-bit +precision and $O(\log n)$ embedding size can solve any problem solvable by +boolean circuits of size $T$. Empirically, enabling CoT dramatically improves +the accuracy for tasks that are hard for parallel computation, including the +composition of permutation groups, iterated squaring, and circuit value +problems, especially for low-depth transformers. + +
+
+ comment: 38 pages, 10 figures. Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Nearest Neighbor Complexity and Boolean Circuits + + +
+ A nearest neighbor representation of a Boolean function $f$ is a set of +vectors (anchors) labeled by $0$ or $1$ such that $f(\vec{x}) = 1$ if and only +if the closest anchor to $\vec{x}$ is labeled by $1$. This model was introduced +by Hajnal, Liu, and Tur\'an (2022), who studied bounds on the number of anchors +required to represent Boolean functions under different choices of anchors +(real vs. Boolean vectors) as well as the more expressive model of $k$-nearest +neighbors. + We initiate the study of the representational power of nearest and +$k$-nearest neighbors through Boolean circuit complexity. To this end, we +establish a connection between Boolean functions with polynomial nearest +neighbor complexity and those that can be efficiently represented by classes +based on linear inequalities -- min-plus polynomial threshold functions -- +previously studied in relation to threshold circuits. This extends an +observation of Hajnal et al. (2022). We obtain exponential lower bounds on the +$k$-nearest neighbors complexity of explicit $n$-variate functions, assuming $k +\leq n^{1-\epsilon}$. Previously, no superlinear lower bound was known for any +$k>1$. + Next, we further extend the connection between nearest neighbor +representations and circuits to the $k$-nearest neighbors case. As a result, we +show that proving superpolynomial lower bounds for the $k$-nearest neighbors +complexity of an explicit function for arbitrary $k$ would require a +breakthrough in circuit complexity. In addition, we prove an exponential +separation between the nearest neighbor and $k$-nearest neighbors complexity +(for unrestricted $k$) of an explicit function. These results address questions +raised by Hajnal et al. (2022) of proving strong lower bounds for $k$-nearest +neighbors and understanding the role of the parameter $k$. Finally, we devise +new bounds on the nearest neighbor complexity for several explicit functions. + +
+
+ comment: Minor corrections +
+
+
+
+
+ + ♻ ☆ Causal Discovery under Latent Class Confounding + + +
+ An acyclic causal structure can be described using a directed acyclic graph +(DAG) with arrows indicating causation. The task of learning this structure +from data is known as "causal discovery." Diverse populations or changing +environments can sometimes give rise to heterogeneous data. This heterogeneity +can be thought of as a mixture model with multiple "sources," each exerting +their own distinct signature on the observed variables. From this perspective, +the source is a latent common cause for every observed variable. While some +methods for causal discovery are able to work around unobserved confounding in +special cases, the only known ways to deal with a global confounder (such as a +latent class) involve parametric assumptions. Focusing on discrete observables, +we demonstrate that globally confounded causal structures can still be +identifiable without parametric assumptions, so long as the number of latent +classes remains small relative to the size and sparsity of the underlying DAG. + +
+
+
+
+
+ + ♻ ☆ Space-bounded quantum state testing via space-efficient quantum singular + value transformation + + +
+ Driven by exploring the power of quantum computation with a limited number of +qubits, we present a novel complete characterization for space-bounded quantum +computation, which encompasses settings with one-sided error (unitary coRQL) +and two-sided error (BQL), approached from a quantum state testing perspective: + - The first family of natural complete problems for unitary coRQL, i.e., +space-bounded quantum state certification for trace distance and +Hilbert-Schmidt distance; + - A new family of natural complete problems for BQL, i.e., space-bounded +quantum state testing for trace distance, Hilbert-Schmidt distance, and quantum +entropy difference. + In the space-bounded quantum state testing problem, we consider two +logarithmic-qubit quantum circuits (devices) denoted as $Q_0$ and $Q_1$, which +prepare quantum states $\rho_0$ and $\rho_1$, respectively, with access to +their ``source code''. Our goal is to decide whether $\rho_0$ is +$\epsilon_1$-close to or $\epsilon_2$-far from $\rho_1$ with respect to a +specified distance-like measure. Interestingly, unlike time-bounded state +testing problems, our results reveal that the space-bounded state testing +problems all correspond to the same class. Moreover, our algorithms on the +trace distance inspire an algorithmic Holevo-Helstrom measurement, implying +QSZK is in QIP(2) with a quantum linear-space honest prover. + Our results primarily build upon a space-efficient variant of the quantum +singular value transformation (QSVT) introduced by Gily\'en, Su, Low, and Wiebe +(STOC 2019), which is of independent interest. Our technique provides a unified +approach for designing space-bounded quantum algorithms. Specifically, we show +that implementing QSVT for any bounded polynomial that approximates a +piecewise-smooth function incurs only a constant overhead in terms of the space +required for special forms of the projected unitary encoding. + +
+
+ comment: 71 pages, 3 figures. v2: improved error and norm bounds in + space-efficient polynomial approximation (Section 3.1), clarified the + application scope of the robust oblivious amplitude amplification in Theorem + 3.10, and added new results on algorithmic Holevo-Helstrom measurement and a + slightly improved upper bound for QSZK (Section 5) +
+
+
+
+
+ + ♻ ☆ Local Enumeration and Majority Lower Bounds + + +
+ Depth-3 circuit lower bounds and $k$-SAT algorithms are intimately related; +the state-of-the-art $\Sigma^k_3$-circuit lower bound and the $k$-SAT algorithm +are based on the same combinatorial theorem. In this paper we define a problem +which reveals new interactions between the two. Define Enum($k$, $t$) problem +as: given an $n$-variable $k$-CNF and an initial assignment $\alpha$, output +all satisfying assignments at Hamming distance $t$ from $\alpha$, assuming that +there are no satisfying assignments of Hamming distance less than $t$ from +$\alpha$. Observe that: an upper bound $b(n, k, t)$ on the complexity of +Enum($k$, $t$) implies: + - Depth-3 circuits: Any $\Sigma^k_3$ circuit computing the Majority function +has size at least $\binom{n}{\frac{n}{2}}/b(n, k, \frac{n}{2})$. + - $k$-SAT: There exists an algorithm solving $k$-SAT in time $O(\sum_{t = +1}^{n/2}b(n, k, t))$. + A simple construction shows that $b(n, k, \frac{n}{2}) \ge 2^{(1 - +O(\log(k)/k))n}$. Thus, matching upper bounds would imply a +$\Sigma^k_3$-circuit lower bound of $2^{\Omega(\log(k)n/k)}$ and a $k$-SAT +upper bound of $2^{(1 - \Omega(\log(k)/k))n}$. The former yields an +unrestricted depth-3 lower bound of $2^{\omega(\sqrt{n})}$ solving a long +standing open problem, and the latter breaks the Super Strong Exponential Time +Hypothesis. + In this paper, we propose a randomized algorithm for Enum($k$, $t$) and +introduce new ideas to analyze it. We demonstrate the power of our ideas by +considering the first non-trivial instance of the problem, i.e., Enum($3$, +$\frac{n}{2}$). We show that the expected running time of our algorithm is +$1.598^n$, substantially improving on the trivial bound of $3^{n/2} \simeq +1.732^n$. This already improves $\Sigma^3_3$ lower bounds for Majority function +to $1.251^n$. The previous bound was $1.154^n$ which follows from the work of +H{\aa}stad, Jukna, and Pudl\'ak (Comput. Complex.'95). + +
+
+
+
+
+
+
+
+ + Logic in Computer Science 12 + +
+
+
+ + ☆ Measuring data types + + +
+ In this article, we combine Sweedler's classic theory of measuring coalgebras +-- by which $k$-algebras are enriched in $k$-coalgebras for $k$ a field -- with +the theory of W-types -- by which the categorical semantics of inductive data +types in functional programming languages are understood. In our main theorem, +we find that under some hypotheses, algebras of an endofunctor are enriched in +coalgebras of the same endofunctor, and we find polynomial endofunctors provide +many interesting examples of this phenomenon. We then generalize the notion of +initial algebra of an endofunctor using this enrichment, thus generalizing the +notion of W-type. This article is an extended version of arXiv:2303.16793, it +adds expository introductions to the original theories of measuring coalgebras +and W-types along with some improvements to the main theory and many explicitly +worked examples. + +
+
+ comment: 67 pages +
+
+
+
+
+ + ☆ Logical Characterizations of Recurrent Graph Neural Networks with Reals + and Floats + + +
+ In pioneering work from 2019, Barcel\'o and coauthors identified logics that +precisely match the expressive power of constant iteration-depth graph neural +networks (GNNs) relative to properties definable in first-order logic. In this +article, we give exact logical characterizations of recurrent GNNs in two +scenarios: (1) in the setting with floating-point numbers and (2) with reals. +For floats, the formalism matching recurrent GNNs is a rule-based modal logic +with counting, while for reals we use a suitable infinitary modal logic, also +with counting. These results give exact matches between logics and GNNs in the +recurrent setting without relativising to a background logic in either case, +but using some natural assumptions about floating-point arithmetic. Applying +our characterizations, we also prove that, relative to graph properties +definable in monadic second-order logic (MSO), our infinitary and rule-based +logics are equally expressive. This implies that recurrent GNNs with reals and +floats have the same expressive power over MSO-definable properties and shows +that, for such properties, also recurrent GNNs with reals are characterized by +a (finitary!) rule-based modal logic. In the general case, in contrast, the +expressive power with floats is weaker than with reals. In addition to +logic-oriented results, we also characterize recurrent GNNs, with both reals +and floats, via distributed automata, drawing links to distributed computing +models. + +
+
+
+
+
+ + ☆ A logic of judgmental existence and its relation to proof irrelevance + + +
+ We introduce a simple natural deduction system for reasoning with judgments +of the form "there exists a proof of $\varphi$" to explore the notion of +judgmental existence following Martin-L\"{o}f's methodology of distinguishing +between judgments and propositions. In this system, the existential judgment +can be internalized into a modal notion of propositional existence that is +closely related to truncation modality, a key tool for obtaining proof +irrelevance, and lax modality. We provide a computational interpretation in the +style of the Curry-Howard isomorphism for the existence modality and show that +the corresponding system has some desirable properties such as strong +normalization or subject reduction. + +
+
+
+
+
+ + ☆ Verifying Global Two-Safety Properties in Neural Networks with + Confidence + + +
+ We present the first automated verification technique for confidence-based +2-safety properties, such as global robustness and global fairness, in deep +neural networks (DNNs). Our approach combines self-composition to leverage +existing reachability analysis techniques and a novel abstraction of the +softmax function, which is amenable to automated verification. We characterize +and prove the soundness of our static analysis technique. Furthermore, we +implement it on top of Marabou, a safety analysis tool for neural networks, +conducting a performance evaluation on several publicly available benchmarks +for DNN verification. + +
+
+ comment: Accepted at the 36th International Conference on Computer Aided + Verification, 2024 +
+
+
+
+
+ + ☆ Language processing in humans and computers + + +
+ Machine-learned language models have transformed everyday life: they steer us +when we study, drive, manage money. They have the potential to transform our +civilization. But they hallucinate. Their realities are virtual. This note +provides a high-level overview of language models and outlines a low-level +model of learning machines. It turns out that, after they become capable of +recognizing hallucinations and dreaming safely, as humans tend to be, the +language-learning machines proceed to generate broader systems of false beliefs +and self-confirming theories, as humans tend to do. + +
+
+ comment: 100 pages, 64 figures; lecture notes, book draft +
+
+
+
+
+ + ♻ ☆ Almost Surely Asymptotically Constant Graph Neural Networks + + +
+ We present a new angle on the expressive power of graph neural networks +(GNNs) by studying how the predictions of a GNN probabilistic classifier evolve +as we apply it on larger graphs drawn from some random graph model. We show +that the output converges to a constant function, which upper-bounds what these +classifiers can uniformly express. This strong convergence phenomenon applies +to a very wide class of GNNs, including state of the art models, with +aggregates including mean and the attention-based mechanism of graph +transformers. Our results apply to a broad class of random graph models, +including sparse and dense variants of the Erd\H{o}s-R\'enyi model, the +stochastic block model, and the Barab\'asi-Albert model. We empirically +validate these findings, observing that the convergence phenomenon appears not +only on random graphs but also on some real-world graphs. + +
+
+ comment: 9 body pages, 28 appendix pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Monoidal bicategories, differential linear logic, and analytic functors + + +
+ We develop further the theory of monoidal bicategories by introducing and +studying bicategorical counterparts of the notions of a linear exponential +comonad, as considered in the study of linear logic, and of a codereliction +transformation, introduced to study differential linear logic via differential +categories. As an application, we extend the differential calculus of Joyal's +analytic functors to analytic functors between presheaf categories, just as +ordinary calculus extends from a single variable to many variables. + +
+
+ comment: v2: fixed typos, added references. 46 pages. Comments welcome +
+
+
+
+
+ + ♻ ☆ A Profunctorial Semantics for Quantum Supermaps + + +
+ We identify morphisms of strong profunctors as a categorification of quantum +supermaps. These black-box generalisations of diagrams-with-holes are hence +placed within the broader field of profunctor optics, as morphisms in the +category of copresheaves on concrete networks. This enables the first +construction of abstract logical connectives such as tensor products and +negations for supermaps in a totally theory-independent setting. These logical +connectives are found to be all that is needed to abstractly model the key +structural features of the quantum theory of supermaps: black-box indefinite +causal order, black-box definite causal order, and the factorisation of +definitely causally ordered supermaps into concrete circuit diagrams. We +demonstrate that at the heart of these factorisation theorems lies the Yoneda +lemma and the notion of representability. + +
+
+
+
+
+ + ♻ ☆ A Truly Concurrent Semantics for Reversible CCS + + +
+ Reversible CCS (RCCS) is a well-established, formal model for reversible +communicating systems, which has been built on top of the classical Calculus of +Communicating Systems (CCS). In its original formulation, each CCS process is +equipped with a memory that records its performed actions, which is then used +to reverse computations. More recently, abstract models for RCCS have been +proposed in the literature, basically, by directly associating RCCS processes +with (reversible versions of) event structures. In this paper we propose a +different abstract model: starting from one of the well-known encodings of CCS +into Petri nets we apply a recently proposed approach to incorporate +causally-consistent reversibility to Petri nets, obtaining as a result the +(reversible) net counterpart of every RCCS term. + +
+
+
+
+
+ + ♻ ☆ Tractable and Intractable Entailment Problems in Separation Logic with + Inductively Defined Predicates + + +
+ We establish various complexity results for the entailment problem between +formulas in Separation Logic with user-defined predicates denoting recursive +data structures. The considered fragments are characterized by syntactic +conditions on the inductive rules that define the semantics of the predicates. +We focus on so-called P-rules, which are similar to (but simpler than) the PCE +rules introduced by Iosif et al. in 2013. In particular, for a specific +fragment where predicates are defined by so-called loc-deterministic inductive +rules, we devise a sound and complete cyclic proof procedure running in +polynomial time. Several complexity lower bounds are provided, showing that any +relaxing of the provided conditions makes the problem intractable. + +
+
+
+
+
+ + ♻ ☆ Transpension: The Right Adjoint to the Pi-type + + +
+ Presheaf models of dependent type theory have been successfully applied to +model HoTT, parametricity, and directed, guarded and nominal type theory. There +has been considerable interest in internalizing aspects of these presheaf +models, either to make the resulting language more expressive, or in order to +carry out further reasoning internally, allowing greater abstraction and +sometimes automated verification. While the constructions of presheaf models +largely follow a common pattern, approaches towards internalization do not. +Throughout the literature, various internal presheaf operators ($\surd$, +$\Phi/\mathsf{extent}$, $\Psi/\mathsf{Gel}$, $\mathsf{Glue}$, $\mathsf{Weld}$, +$\mathsf{mill}$, the strictness axiom and locally fresh names) can be found and +little is known about their relative expressiveness. Moreover, some of these +require that variables whose type is a shape (representable presheaf, e.g. an +interval) be used affinely. + We propose a novel type former, the transpension type, which is right adjoint +to universal quantification over a shape. Its structure resembles a dependent +version of the suspension type in HoTT. We give general typing rules and a +presheaf semantics in terms of base category functors dubbed multipliers. +Structural rules for shape variables and certain aspects of the transpension +type depend on characteristics of the multiplier. We demonstrate how the +transpension type and the strictness axiom can be combined to implement all and +improve some of the aforementioned internalization operators (without formal +claim in the case of locally fresh names). + +
+
+ comment: 54 pages, 12 figures. Removed tick and lockless notation, changed + most terminology (dictionary available), other tweaks +
+
+
+
+
+ + ♻ ☆ The Transpension Type: Technical Report + + +
+ The purpose of these notes is to give a categorical semantics for the +transpension type (Nuyts and Devriese, Transpension: The Right Adjoint to the +Pi-type, Accepted at LMCS, 2024), which is right adjoint to a potentially +substructural dependent function type. In section 2 we discuss some +prerequisites. In section 3, we define multipliers and discuss their +properties. In section 4, we study how multipliers lift from base categories to +presheaf categories. In section 5, we explain how typical presheaf modalities +can be used in the presence of the transpension type. In section 6, we study +commutation properties of prior modalities, substitution modalities and +multiplier modalities. + +
+
+ comment: 47 pages, 1 figure. Changes: Mainly change in terminology +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Hardware Architecture 4 + +
+
+
+ + ☆ Memory Scraping Attack on Xilinx FPGAs: Private Data Extraction from + Terminated Processes + + +
+ FPGA-based hardware accelerators are becoming increasingly popular due to +their versatility, customizability, energy efficiency, constant latency, and +scalability. FPGAs can be tailored to specific algorithms, enabling efficient +hardware implementations that effectively leverage algorithm parallelism. This +can lead to significant performance improvements over CPUs and GPUs, +particularly for highly parallel applications. For example, a recent study +found that Stratix 10 FPGAs can achieve up to 90\% of the performance of a +TitanX Pascal GPU while consuming less than 50\% of the power. This makes FPGAs +an attractive choice for accelerating machine learning (ML) workloads. However, +our research finds privacy and security vulnerabilities in existing Xilinx +FPGA-based hardware acceleration solutions. These vulnerabilities arise from +the lack of memory initialization and insufficient process isolation, which +creates potential avenues for unauthorized access to private data used by +processes. To illustrate this issue, we conducted experiments using a Xilinx +ZCU104 board running the PetaLinux tool from Xilinx. We found that PetaLinux +does not effectively clear memory locations associated with a terminated +process, leaving them vulnerable to memory scraping attack (MSA). This paper +makes two main contributions. The first contribution is an attack methodology +of using the Xilinx debugger from a different user space. We find that we are +able to access process IDs, virtual address spaces, and pagemaps of one user +from a different user space because of lack of adequate process isolation. The +second contribution is a methodology for characterizing terminated processes +and accessing their private data. We illustrate this on Xilinx ML application +library. + +
+
+
+
+
+ + ☆ Carbon Connect: An Ecosystem for Sustainable Computing + + +
+ Computing is at a moment of profound opportunity. Emerging applications -- +such as capable artificial intelligence, immersive virtual realities, and +pervasive sensor systems -- drive unprecedented demand for computing. Despite +recent advances toward net zero carbon emissions, the computing industry's +gross energy usage continues to rise at an alarming rate, outpacing the growth +of new energy installations and renewable energy deployments. A shift towards +sustainability is needed to spark a transformation in how computer systems are +manufactured, allocated, and consumed. + Carbon Connect envisions coordinated research thrusts that produce design and +management strategies for sustainable, next-generation computer systems. These +strategies must flatten and then reverse growth trajectories for computing +power and carbon for society's most rapidly growing applications such as +artificial intelligence and virtual spaces. We will require accurate models for +carbon accounting in computing technology. For embodied carbon, we must +re-think conventional design strategies -- over-provisioned monolithic servers, +frequent hardware refresh cycles, custom silicon -- and adopt life-cycle design +strategies that more effectively reduce, reuse and recycle hardware at scale. +For operational carbon, we must not only embrace renewable energy but also +design systems to use that energy more efficiently. Finally, new hardware +design and management strategies must be cognizant of economic policy and +regulatory landscape, aligning private initiatives with societal goals. Many of +these broader goals will require computer scientists to develop deep, enduring +collaborations with researchers in economics, law, and industrial ecology to +spark change in broader practice. + +
+
+
+
+
+ + ☆ Time-Series Forecasting and Sequence Learning Using Memristor-based + Reservoir System + + +
+ Pushing the frontiers of time-series information processing in ever-growing +edge devices with stringent resources has been impeded by the system's ability +to process information and learn locally on the device. Local processing and +learning typically demand intensive computations and massive storage as the +process involves retrieving information and tuning hundreds of parameters back +in time. In this work, we developed a memristor-based echo state network +accelerator that features efficient temporal data processing and in-situ online +learning. The proposed design is benchmarked using various datasets involving +real-world tasks, such as forecasting the load energy consumption and weather +conditions. The experimental results illustrate that the hardware model +experiences a marginal degradation (~4.8%) in performance as compared to the +software model. This is mainly attributed to the limited precision and dynamic +range of network parameters when emulated using memristor devices. The proposed +system is evaluated for lifespan, robustness, and energy-delay product. It is +observed that the system demonstrates a reasonable robustness for device +failure below 10%, which may occur due to stuck-at faults. Furthermore, 246X +reduction in energy consumption is achieved when compared to a custom CMOS +digital design implemented at the same technology node. + +
+
+
+
+
+ + ♻ Automatic Hardware Pragma Insertion in High-Level Synthesis: A + Non-Linear Programming Approach + + +
+ High-Level Synthesis enables the rapid prototyping of hardware accelerators, +by combining a high-level description of the functional behavior of a kernel +with a set of micro-architecture optimizations as inputs. Such optimizations +can be described by inserting pragmas for e.g. pipelining and replication of +units, or even higher level transformations for HLS such as automatic data +caching using the AMD/Xilinx Merlin compiler. Selecting the best combination of +pragmas, even within a restricted set, remains particularly challenging and the +typical state-of-practice uses design-space exploration to navigate this space. +But due to the highly irregular performance distribution of pragma +configurations, typical DSE approaches are either extremely time consuming, or +operating on a severely restricted search space. + In this work we propose a framework to automatically insert HLS pragmas in +regular loop-based programs, supporting pipelining, unit replication (coarse- +and fine-grain), and data caching. We develop an analytical performance and +resource model as a function of the input program properties and pragmas +inserted, using non-linear constraints and objectives. We prove this model +provides a lower bound on the actual performance after HLS. We then encode this +model as a Non-Linear Program, by making the pragma configuration unknowns of +the system, which is computed optimally by solving this NLP. This approach can +also be used during DSE, to quickly prune points with a (possibly partial) +pragma configuration, driven by lower bounds on achievable latency. We +extensively evaluate our end-to-end, fully implemented system, showing it can +effectively manipulate spaces of billions of designs in seconds to minutes for +the kernels evaluated. + +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 23 + +
+
+
+ + ☆ SlipStream: Adapting Pipelines for Distributed Training of Large DNNs + Amid Failures + + +
+ Training large Deep Neural Network (DNN) models requires thousands of GPUs +for days or weeks at a time. At these scales, failures are frequent and can +have a big impact on training throughput. Restoring performance using spare GPU +servers becomes increasingly expensive as models grow. SlipStream is a system +for efficient DNN training in the presence of failures, without using spare +servers. It exploits the functional redundancy inherent in distributed training +systems -- servers hold the same model parameters across data-parallel groups +-- as well as the bubbles in the pipeline schedule within each data-parallel +group. SlipStream dynamically re-routes the work of a failed server to its +data-parallel peers, ensuring continuous training despite multiple failures. +However, re-routing work leads to imbalances across pipeline stages that +degrades training throughput. SlipStream introduces two optimizations that +allow re-routed work to execute within bubbles of the original pipeline +schedule. First, it decouples the backward pass computation into two phases. +Second, it staggers the execution of the optimizer step across pipeline stages. +Combined, these optimizations enable schedules that minimize or even eliminate +training throughput degradation during failures. We describe a prototype for +SlipStream and show that it achieves high training throughput under multiple +failures, outperforming recent proposals for fault-tolerant training such as +Oobleck and Bamboo by up to 1.46x and 1.64x, respectively. + +
+
+
+
+
+ + ☆ SADDLe: Sharpness-Aware Decentralized Deep Learning with Heterogeneous + Data + + +
+ Decentralized training enables learning with distributed datasets generated +at different locations without relying on a central server. In realistic +scenarios, the data distribution across these sparsely connected learning +agents can be significantly heterogeneous, leading to local model over-fitting +and poor global model generalization. Another challenge is the high +communication cost of training models in such a peer-to-peer fashion without +any central coordination. In this paper, we jointly tackle these two-fold +practical challenges by proposing SADDLe, a set of sharpness-aware +decentralized deep learning algorithms. SADDLe leverages Sharpness-Aware +Minimization (SAM) to seek a flatter loss landscape during training, resulting +in better model generalization as well as enhanced robustness to communication +compression. We present two versions of our approach and conduct extensive +experiments to show that SADDLe leads to 1-20% improvement in test accuracy +compared to other existing techniques. Additionally, our proposed approach is +robust to communication compression, with an average drop of only 1% in the +presence of up to 4x compression. + +
+
+
+
+
+ + ☆ FACT or Fiction: Can Truthful Mechanisms Eliminate Federated Free + Riding? + + +
+ Standard federated learning (FL) approaches are vulnerable to the free-rider +dilemma: participating agents can contribute little to nothing yet receive a +well-trained aggregated model. While prior mechanisms attempt to solve the +free-rider dilemma, none have addressed the issue of truthfulness. In practice, +adversarial agents can provide false information to the server in order to +cheat its way out of contributing to federated training. In an effort to make +free-riding-averse federated mechanisms truthful, and consequently less prone +to breaking down in practice, we propose FACT. FACT is the first federated +mechanism that: (1) eliminates federated free riding by using a penalty system, +(2) ensures agents provide truthful information by creating a competitive +environment, and (3) encourages agent participation by offering better +performance than training alone. Empirically, FACT avoids free-riding when +agents are untruthful, and reduces agent loss by over 4x. + +
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ Carbon Connect: An Ecosystem for Sustainable Computing + + +
+ Computing is at a moment of profound opportunity. Emerging applications -- +such as capable artificial intelligence, immersive virtual realities, and +pervasive sensor systems -- drive unprecedented demand for computing. Despite +recent advances toward net zero carbon emissions, the computing industry's +gross energy usage continues to rise at an alarming rate, outpacing the growth +of new energy installations and renewable energy deployments. A shift towards +sustainability is needed to spark a transformation in how computer systems are +manufactured, allocated, and consumed. + Carbon Connect envisions coordinated research thrusts that produce design and +management strategies for sustainable, next-generation computer systems. These +strategies must flatten and then reverse growth trajectories for computing +power and carbon for society's most rapidly growing applications such as +artificial intelligence and virtual spaces. We will require accurate models for +carbon accounting in computing technology. For embodied carbon, we must +re-think conventional design strategies -- over-provisioned monolithic servers, +frequent hardware refresh cycles, custom silicon -- and adopt life-cycle design +strategies that more effectively reduce, reuse and recycle hardware at scale. +For operational carbon, we must not only embrace renewable energy but also +design systems to use that energy more efficiently. Finally, new hardware +design and management strategies must be cognizant of economic policy and +regulatory landscape, aligning private initiatives with societal goals. Many of +these broader goals will require computer scientists to develop deep, enduring +collaborations with researchers in economics, law, and industrial ecology to +spark change in broader practice. + +
+
+
+
+
+ + ☆ MPI Progress For All + + +
+ The progression of communication in the Message Passing Interface (MPI) is +not well defined, yet it is critical for application performance, particularly +in achieving effective computation and communication overlap. The opaque nature +of MPI progress poses significant challenges in advancing MPI within modern +high-performance computing (HPC) practices. Firstly, the lack of clarity +hinders the development of explicit guidelines for enhancing computation and +communication overlap in applications. Secondly, it prevents MPI from +seamlessly integrating with contemporary programming paradigms, such as +task-based runtimes and event-driven programming. Thirdly, it limits the +extension of MPI functionalities from the user space. In this paper, we examine +the role of MPI progress by analyzing the implementation details of MPI +messaging. We then generalize the asynchronous communication pattern and +identify key factors influencing application performance. Based on this +analysis, we propose a set of MPI extensions designed to enable users to +explicitly construct and manage an efficient progress engine. We provide +example codes to demonstrate the use of these proposed APIs in achieving +improved performance, adapting MPI to task-based or event-driven programming +styles, and constructing collective algorithms that rival the performance of +native implementations. Our approach is compared to previous efforts in the +field, highlighting its reduced complexity and increased effectiveness. + +
+
+ comment: Submitting to EuroMPI'24 +
+
+
+
+
+ + ☆ CG-FedLLM: How to Compress Gradients in Federated Fine-tuning for Large + Language Models + +
+ The success of current Large-Language Models (LLMs) hinges on extensive +training data that is collected and stored centrally, called Centralized +Learning (CL). However, such a collection manner poses a privacy threat, and +one potential solution is Federated Learning (FL), which transfers gradients, +not raw data, among clients. Unlike traditional networks, FL for LLMs incurs +significant communication costs due to their tremendous parameters. This study +introduces an innovative approach to compress gradients to improve +communication efficiency during LLM FL, formulating the new FL pipeline named +CG-FedLLM. This approach integrates an encoder on the client side to acquire +the compressed gradient features and a decoder on the server side to +reconstruct the gradients. We also developed a novel training strategy that +comprises Temporal-ensemble Gradient-Aware Pre-training (TGAP) to identify +characteristic gradients of the target model and Federated AutoEncoder-Involved +Fine-tuning (FAF) to compress gradients adaptively. Extensive experiments +confirm that our approach reduces communication costs and improves performance +(e.g., average 3 points increment compared with traditional CL- and FL-based +fine-tuning with LlaMA on a well-recognized benchmark, C-Eval). This +improvement is because our encoder-decoder, trained via TGAP and FAF, can +filter gradients while selectively preserving critical features. Furthermore, +we present a series of experimental analyses focusing on the signal-to-noise +ratio, compression rate, and robustness within this privacy-centric framework, +providing insight into developing more efficient and secure LLMs. + +
+
+
+
+
+ + ☆ Total cost of ownership and evaluation of Google cloud resources for the + ATLAS experiment at the LHC + + +
+ The ATLAS Google Project was established as part of an ongoing evaluation of +the use of commercial clouds by the ATLAS Collaboration, in anticipation of the +potential future adoption of such resources by WLCG grid sites to fulfil or +complement their computing pledges. Seamless integration of Google cloud +resources into the worldwide ATLAS distributed computing infrastructure was +achieved at large scale and for an extended period of time, and hence cloud +resources are shown to be an effective mechanism to provide additional, +flexible computing capacity to ATLAS. For the first time a total cost of +ownership analysis has been performed, to identify the dominant cost drivers +and explore effective mechanisms for cost control. Network usage significantly +impacts the costs of certain ATLAS workflows, underscoring the importance of +implementing such mechanisms. Resource bursting has been successfully +demonstrated, whilst exposing the true cost of this type of activity. A +follow-up to the project is underway to investigate methods for improving the +integration of cloud resources in data-intensive distributed computing +environments and reducing costs related to network connectivity, which +represents the primary expense when extensively utilising cloud resources. + +
+
+ comment: 48 pages in total, author list starting page 31, 7 figures, 1 table, + submitted to Computing and Software for Big Science. All figures including + auxiliary figures are available at + https://atlas.web.cern.ch/Atlas/GROUPS/PHYSICS/PAPERS/SOFT-2023-02/ +
+
+
+
+
+ + ☆ An optimal algorithm for geodesic mutual visibility on hexagonal grids + + +
+ For a set of robots (or agents) moving in a graph, two properties are highly +desirable: confidentiality (i.e., a message between two agents must not pass +through any intermediate agent) and efficiency (i.e., messages are delivered +through shortest paths). These properties can be obtained if the +\textsc{Geodesic Mutual Visibility} (GMV, for short) problem is solved: +oblivious robots move along the edges of the graph, without collisions, to +occupy some vertices that guarantee they become pairwise geodesic mutually +visible. This means there is a shortest path (i.e., a ``geodesic'') between +each pair of robots along which no other robots reside. In this work, we +optimally solve GMV on finite hexagonal grids $G_k$. This, in turn, requires +first solving a graph combinatorial problem, i.e. determining the maximum +number of mutually visible vertices in $G_k$. + +
+
+ comment: 24 pages, 13 figures +
+
+
+
+
+ + ☆ GeoFF: Federated Serverless Workflows with Data Pre-Fetching + + +
+ Function-as-a-Service (FaaS) is a popular cloud computing model in which +applications are implemented as workflows of multiple independent functions. +While cloud providers usually offer composition services for such workflows, +they do not support cross-platform workflows forcing developers to hardcode the +composition logic. Furthermore, FaaS workflows tend to be slow due to cascading +cold starts, inter-function latency, and data download latency on the critical +path. In this paper, we propose GeoFF, a serverless choreography middleware +that executes FaaS workflows across different public and private FaaS +platforms, including ad-hoc workflow recomposition. Furthermore, GeoFF supports +function pre-warming and data pre-fetching. This minimizes end-to-end workflow +latency by taking cold starts and data download latency off the critical path. +In experiments with our proof-of-concept prototype and a realistic application, +we were able to reduce end-to-end latency by more than 50%. + +
+
+
+
+
+ + ☆ Emulating Full Client Participation: A Long-Term Client Selection + Strategy for Federated Learning + + +
+ Client selection significantly affects the system convergence efficiency and +is a crucial problem in federated learning. Existing methods often select +clients by evaluating each round individually and overlook the necessity for +long-term optimization, resulting in suboptimal performance and potential +fairness issues. In this study, we propose a novel client selection strategy +designed to emulate the performance achieved with full client participation. In +a single round, we select clients by minimizing the gradient-space estimation +error between the client subset and the full client set. In multi-round +selection, we introduce a novel individual fairness constraint, which ensures +that clients with similar data distributions have similar frequencies of being +selected. This constraint guides the client selection process from a long-term +perspective. We employ Lyapunov optimization and submodular functions to +efficiently identify the optimal subset of clients, and provide a theoretical +analysis of the convergence ability. Experiments demonstrate that the proposed +strategy significantly improves both accuracy and fairness compared to previous +methods while also exhibiting efficiency by incurring minimal time overhead. + +
+
+
+
+
+ + ☆ ElastiBench: Scalable Continuous Benchmarking on Cloud FaaS Platforms + + +
+ Running microbenchmark suites often and early in the development process +enables developers to identify performance issues in their application. +Microbenchmark suites of complex applications can comprise hundreds of +individual benchmarks and take multiple hours to evaluate meaningfully, making +running those benchmarks as part of CI/CD pipelines infeasible. In this paper, +we reduce the total execution time of microbenchmark suites by leveraging the +massive scalability and elasticity of FaaS (Function-as-a-Service) platforms. +While using FaaS enables users to quickly scale up to thousands of parallel +function instances to speed up microbenchmarking, the performance variation and +low control over the underlying computing resources complicate reliable +benchmarking. We demonstrate an architecture for executing microbenchmark +suites on cloud FaaS platforms and evaluate it on code changes from an +open-source time series database. Our evaluation shows that our prototype can +produce reliable results (~95% of performance changes accurately detected) in a +quarter of the time (<=15min vs.~4h) and at lower cost ($0.49 vs. ~$1.18) +compared to cloud-based virtual machines. + +
+
+
+
+
+ + ☆ Building a Verifiable Logical Clock for P2P Networks + + +
+ Logical clocks are a fundamental tool to establish causal ordering of events
+in a distributed system. They have been applied in weakly consistent storage
+systems, causally ordered broadcast, distributed snapshots, deadlock detection,
+and distributed system debugging. However, prior logical clock constructs fail
+to work in an open network with Byzantine participants. In this work, we
+present Chrono, a novel logical clock system that targets such a challenging
+environment. We first redefine causality properties among distributed processes
+under the Byzantine failure model. To enforce these properties, Chrono defines
+a new validator abstraction for building fault-tolerant logical clocks.
+Furthermore, our validator abstraction is customizable: Chrono includes
+multiple backend implementations for the abstraction, each with different
+security-performance trade-offs. We have applied Chrono to build two
+decentralized applications, a mutual exclusion service and a weakly consistent
+key-value store. Chrono adds only marginal overhead compared to systems that
+tolerate no Byzantine faults. It also out-performs state-of-the-art BFT total
+order protocols by significant margins.
+
+
+
+
+
+ + ♻ ☆ AdaptSFL: Adaptive Split Federated Learning in Resource-constrained Edge + Networks + + +
+ The increasing complexity of deep neural networks poses significant barriers
+to democratizing them to resource-limited edge devices. To address this
+challenge, split federated learning (SFL) has emerged as a promising solution
+by offloading the primary training workload to a server via model partitioning
+while enabling parallel training among edge devices. However, although system
+optimization substantially influences the performance of SFL under
+resource-constrained systems, the problem remains largely uncharted. In this
+paper, we provide a convergence analysis of SFL which quantifies the impact of
+model splitting (MS) and client-side model aggregation (MA) on the learning
+performance, serving as a theoretical foundation. Then, we propose AdaptSFL, a
+novel resource-adaptive SFL framework, to expedite SFL under
+resource-constrained edge computing systems. Specifically, AdaptSFL adaptively
+controls client-side MA and MS to balance communication-computing latency and
+training convergence. Extensive simulations across various datasets validate
+that our proposed AdaptSFL framework takes considerably less time to achieve a
+target accuracy than benchmarks, demonstrating the effectiveness of the
+proposed strategies.
+
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels + + +
+ Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific
+codes. Due to the heavy strain on the main memory interface from loading the
+sparse matrix and the possibly irregular memory access pattern, SpMV typically
+exhibits low arithmetic intensity. Repeating these products multiple times with
+the same matrix is required in many algorithms. This so-called matrix power
+kernel (MPK) provides an opportunity for data reuse since the same matrix data
+is loaded from main memory multiple times, an opportunity that has only
+recently been exploited successfully with the Recursive Algebraic Coloring
+Engine (RACE). Using RACE, one considers a graph based formulation of the SpMV
+and employs a level-based implementation of SpMV for reuse of relevant matrix
+data. However, the underlying data dependencies have restricted the use of this
+concept to shared memory parallelization and thus to single compute nodes.
+Enabling cache blocking for distributed-memory parallelization of MPK is
+challenging due to the need for explicit communication and synchronization of
+data in neighboring levels. In this work, we propose and implement a flexible
+method that interleaves the cache-blocking capabilities of RACE with an MPI
+communication scheme that fulfills all data dependencies among processes.
+Compared to a "traditional" distributed memory parallel MPK, our new
+Distributed Level-Blocked MPK yields substantial speed-ups on modern Intel and
+AMD architectures across a wide range of sparse matrices from various
+scientific applications. Finally, we address a modern quantum physics problem
+to demonstrate the applicability of our method, achieving a speed-up of up to
+4x on 832 cores of an Intel Sapphire Rapids cluster.
+
+
+ comment: 15 pages, 12 figures, 5 tables; added affiliation & extended + acknowledgment +
+
+
+
+
+ + ♻ ☆ Improved Approximation Bounds for Minimum Weight Cycle in the CONGEST + Model + + +
+ Minimum Weight Cycle (MWC) is the problem of finding a simple cycle of +minimum weight in a graph $G=(V,E)$. This is a fundamental graph problem with +classical sequential algorithms that run in $\tilde{O}(n^3)$ and +$\tilde{O}(mn)$ time where $n=|V|$ and $m=|E|$. In recent years this problem +has received significant attention in the context of fine-grained sequential +complexity as well as in the design of faster sequential approximation +algorithms, though not much is known in the distributed CONGEST model. + We present sublinear-round approximation algorithms for computing MWC in +directed graphs, and weighted graphs. Our algorithms use a variety of +techniques in non-trivial ways, such as in our approximate directed unweighted +MWC algorithm that efficiently computes BFS from all vertices restricted to +certain implicitly computed neighborhoods in sublinear rounds, and in our +weighted approximation algorithms that use unweighted MWC algorithms on scaled +graphs combined with a fast and streamlined method for computing multiple +source approximate SSSP. We present $\tilde{\Omega}(\sqrt{n})$ lower bounds for +arbitrary constant factor approximation of MWC in directed graphs and +undirected weighted graphs. + +
+
+ comment: To appear in PODC 2024 +
+
+
+
+
+ + ♻ ☆ SCARIF: Towards Carbon Modeling of Cloud Servers with Accelerators + + +
+ Embodied carbon has been widely reported as a significant component in the +full system lifecycle of various computing systems' green house gas emissions. +Many efforts have been undertaken to quantify the elements that comprise this +embodied carbon, from tools that evaluate semiconductor manufacturing to those +that can quantify different elements of the computing system from commercial +and academic sources. However, these tools cannot easily reproduce results +reported by server vendors' product carbon reports and the accuracy can vary +substantially due to various assumptions. Furthermore, attempts to determine +green house gas contributions using bottom-up methodologies often do not agree +with system-level studies and are hard to rectify. Nonetheless, given there is +a need to consider all contributions to green house gas emissions in +datacenters, we propose SCARIF, the Server Carbon including Accelerator +Reporter with Intelligence-based Formulation tool. SCARIF has three main +contributions: (1) We first collect reported carbon cost data from server +vendors and design statistic models to predict the embodied carbon cost so that +users can get the embodied carbon cost for their server configurations. (2) We +provide embodied carbon cost if users configure servers with accelerators +including GPUs, and FPGAs. (3) By using case studies, we show that certain +design choices of data center management might flip by the insight and +observation from using SCARIF. Thus, SCARIF provides an opportunity for +large-scale datacenter and hyperscaler design. We release SCARIF as an +open-source tool at https://github.com/arc-research-lab/SCARIF. + +
+
+ comment: 6 pages; 6 figures; 3 tables. Accepted by ISVLSI' 24 +
+
+
+
+
+ + ♻ ☆ Energy-efficient Decentralized Learning via Graph Sparsification SP 2024 + + +
+ This work aims at improving the energy efficiency of decentralized learning +by optimizing the mixing matrix, which controls the communication demands +during the learning process. Through rigorous analysis based on a +state-of-the-art decentralized learning algorithm, the problem is formulated as +a bi-level optimization, with the lower level solved by graph sparsification. A +solution with guaranteed performance is proposed for the special case of +fully-connected base topology and a greedy heuristic is proposed for the +general case. Simulations based on real topology and dataset show that the +proposed solution can lower the energy consumption at the busiest node by +54%-76% while maintaining the quality of the trained model. + +
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Low-Bandwidth Matrix Multiplication: Faster Algorithms and More General + Forms of Sparsity + + +
+ In prior work, Gupta et al. (SPAA 2022) presented a distributed algorithm for +multiplying sparse $n \times n$ matrices, using $n$ computers. They assumed +that the input matrices are uniformly sparse--there are at most $d$ non-zeros +in each row and column--and the task is to compute a uniformly sparse part of +the product matrix. The sparsity structure is globally known in advance (this +is the supported setting). As input, each computer receives one row of each +input matrix, and each computer needs to output one row of the product matrix. +In each communication round each computer can send and receive one $O(\log +n)$-bit message. Their algorithm solves this task in $O(d^{1.907})$ rounds, +while the trivial bound is $O(d^2)$. + We improve on the prior work in two dimensions: First, we show that we can +solve the same task faster, in only $O(d^{1.832})$ rounds. Second, we explore +what happens when matrices are not uniformly sparse. We consider the following +alternative notions of sparsity: row-sparse matrices (at most $d$ non-zeros per +row), column-sparse matrices, matrices with bounded degeneracy (we can +recursively delete a row or column with at most $d$ non-zeros), average-sparse +matrices (at most $dn$ non-zeros in total), and general matrices. + +
+
+
+
+
+ + ♻ ☆ Initialisation and Topology Effects in Decentralised Federated Learning + + +
+ Fully decentralised federated learning enables collaborative training of +individual machine learning models on distributed devices on a communication +network while keeping the training data localised. This approach enhances data +privacy and eliminates both the single point of failure and the necessity for +central coordination. Our research highlights that the effectiveness of +decentralised federated learning is significantly influenced by the network +topology of connected devices. We propose a strategy for uncoordinated +initialisation of the artificial neural networks, which leverages the +distribution of eigenvector centralities of the nodes of the underlying +communication network, leading to a radically improved training efficiency. +Additionally, our study explores the scaling behaviour and choice of +environmental parameters under our proposed initialisation strategy. This work +paves the way for more efficient and scalable artificial neural network +training in a distributed and uncoordinated environment, offering a deeper +understanding of the intertwining roles of network structure and learning +dynamics. + +
+
+
+
+
+ + ♻ ☆ Towards Realistic Mechanisms That Incentivize Federated Participation + and Contribution + + +
+ Edge device participation in federated learning (FL) is typically studied
+through the lens of device-server communication (e.g., device dropout) and
+assumes an undying desire from edge devices to participate in FL. As a result,
+current FL frameworks are flawed when implemented in realistic settings, with
+many encountering the free-rider dilemma. In a step to push FL towards
+realistic settings, we propose RealFM: the first federated mechanism that (1)
+realistically models device utility, (2) incentivizes data contribution and
+device participation, (3) provably removes the free-rider dilemma, and (4)
+relaxes assumptions on data homogeneity and data sharing. Compared to previous
+FL mechanisms, RealFM allows for a non-linear relationship between model
+accuracy and utility, which improves the utility gained by the server and
+participating devices. On real-world data, RealFM improves device and server
+utility, as well as data contribution, by over 3 and 4 magnitudes respectively
+compared to baselines.
+
+
+ comment: 24 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Agent-based Leader Election, MST, and Beyond + + +
+ Leader election is one of the fundamental and well-studied problems in +distributed computing. In this paper, we initiate the study of leader election +using mobile agents. Suppose $n$ agents are positioned initially arbitrarily on +the nodes of an arbitrary, anonymous, $n$-node, $m$-edge graph $G$. The agents +relocate themselves autonomously on the nodes of $G$ and elect an agent as a +leader such that the leader agent knows it is a leader and the other agents +know they are not leaders. The objective is to minimize time and memory +requirements. Following the literature, we consider the synchronous setting in +which each agent performs its operations synchronously with others and hence +the time complexity can be measured in rounds. The quest in this paper is to +provide solutions without agents knowing any graph parameter, such as $n$, a +priori. We first establish that, without agents knowing any graph parameter a +priori, there exists a deterministic algorithm to elect an agent as a leader in +$O(m)$ rounds with $O(n\log n)$ bits at each agent. Using this leader election +result, we develop a deterministic algorithm for agents to construct a minimum +spanning tree of $G$ in $O(m+n\log n)$ rounds using $O(n \log n)$ bits memory +at each agent, without agents knowing any graph parameter a priori. Finally, +using the same leader election result, we provide improved time/memory results +for other fundamental distributed graph problems, namely, gathering, maximal +independent set, and minimal dominating sets, removing the assumptions on +agents knowing graph parameters a priori. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Data Transmissions in Blockchain enabled AGVs + + +
+ Automated Guided Vehicles (AGVs) operate in synergy to execute specific +tasks. These vehicles exchange information to ensure seamless collaboration, +prevent collisions, and eliminate task redundancy. The advent of blockchain +technology offers a promising avenue for establishing a secure and dependable +communication infrastructure for AGVs. Nonetheless, it becomes imperative for +AGVs to adopt efficient data transmission methodologies, especially when +interacting with the dynamic nature of blockchain infrastructure where data +undergoes frequent modifications. + In the present study, we introduce a novel data transmission methodology +tailored for blockchain-integrated AGVs utilizing the principles of Named Data +Networking (NDN). A simulated environment was crafted and executed in NetSim, +wherein multiple AGVs collaboratively endeavored to locate concealed objectives +within a defined region. Upon discovery of novel elements, such as obstructions +or concealed objectives, each AGV would update a collective blockchain +repository. This blockchain infrastructure leverages NDN to fetch specific data +blocks in response to data queries from individual AGVs. This system ensures +that AGVs can navigate and scrutinize their environment with heightened +efficiency, drawing upon the collective intelligence and shared experiences of +the fleet. + +
+
+ comment: The content of this paper needs significant improvement. According to + the requirements of the supervising professor, we need to withdraw it first +
+
+
+
+
+ + ♻ ☆ All-to-all reconfigurability with sparse and higher-order Ising machines + + +
+ Domain-specific hardware to solve computationally hard optimization problems +has generated tremendous excitement recently. Here, we evaluate probabilistic +bit (p-bit) based on Ising Machines (IM) or p-computers with a benchmark +combinatorial optimization problem, namely the 3-regular 3-XOR Satisfiability +(3R3X). The 3R3X problem has a glassy energy landscape, and it has recently +been used to benchmark various IMs and other solvers. We introduce a +multiplexed architecture where p-computers emulate all-to-all (complete) graph +functionality despite being interconnected in sparse networks, enabling a +highly parallelized chromatic Gibbs sampling. We implement this architecture in +FPGAs and show that p-bit networks running an adaptive version of the powerful +parallel tempering algorithm demonstrate competitive algorithmic and prefactor +advantages over alternative IMs by D-Wave, Toshiba, and Fujitsu, except a +greedy algorithm accelerated on a GPU. We further extend our APT results using +higher-order interactions in FPGAs and show that while higher-order +interactions lead to prefactor advantages, they do not show any algorithmic +scaling advantages for the XORSAT problem, settling an open conjecture. Even +though FPGA implementations of p-bits are still not quite as fast as the best +possible greedy algorithms implemented in GPUs, scaled magnetic versions of +p-computers could lead to orders of magnitude over such algorithms according to +experimentally established projections. + +
+
+ comment: First three authors are equally contributing +
+
+
+
+
+
+
+
+ + Programming and Languages 4 + +
+
+
+ + ☆ Verifying Cake-Cutting, Faster + + +
+ Envy-free cake-cutting protocols procedurally divide an infinitely divisible +good among a set of agents so that no agent prefers another's allocation to +their own. These protocols are highly complex and difficult to prove correct. +Recently, Bertram, Levinson, and Hsu introduced a language called Slice for +describing and verifying cake-cutting protocols. Slice programs can be +translated to formulas encoding envy-freeness, which are solved by SMT. While +Slice works well on smaller protocols, it has difficulty scaling to more +complex cake-cutting protocols. + We improve Slice in two ways. First, we show any protocol execution in Slice +can be replicated using piecewise uniform valuations. We then reduce Slice's +constraint formulas to formulas within the theory of linear real arithmetic, +showing that verifying envy-freeness is efficiently decidable. Second, we +design and implement a linear type system which enforces that no two agents +receive the same part of the good. We implement our methods and verify a range +of challenging examples, including the first nontrivial four-agent protocol. + +
+
+ comment: 53 Pages, 12 Figures, CAV 2024 +
+
+
+
+
+ + ☆ Verifying Lock-free Search Structure Templates + + +
+ We present and verify template algorithms for lock-free concurrent search +structures that cover a broad range of existing implementations based on lists +and skiplists. Our linearizability proofs are fully mechanized in the +concurrent separation logic Iris. The proofs are modular and cover the broader +design space of the underlying algorithms by parameterizing the verification +over aspects such as the low-level representation of nodes and the style of +data structure maintenance. As a further technical contribution, we present a +mechanization of a recently proposed method for reasoning about +future-dependent linearization points using hindsight arguments. The +mechanization builds on Iris' support for prophecy reasoning and user-defined +ghost resources. We demonstrate that the method can help to reduce the proof +effort compared to direct prophecy-based proofs. + +
+
+ comment: Extended version of an article to appear in ECOOP'24 +
+
+
+
+
+ + ♻ Enhancing High-Level Synthesis with Automated Pragma Insertion and Code + Transformation Framework + + +
+ High-level synthesis, source-to-source compilers, and various Design Space
+Exploration techniques for pragma insertion have significantly improved the
+Quality of Results of generated designs. These tools offer benefits such as
+reduced development time and enhanced performance. However, achieving
+high-quality results often requires additional manual code transformations and
+tiling selections, which are typically performed separately or as
+pre-processing steps. Although DSE techniques enable code transformation
+upfront, the vastness of the search space often limits the exploration of all
+possible code transformations, making it challenging to determine which
+transformations are necessary. Additionally, ensuring correctness remains
+challenging, especially for complex transformations and optimizations.
+ To tackle this obstacle, we first propose a comprehensive framework
+leveraging HLS compilers. Our system streamlines code transformation, pragma
+insertion, and tile size selection for on-chip data caching through a unified
+optimization problem, aiming to enhance parallelization, particularly
+beneficial for computation-bound kernels. Then, employing a novel Non-Linear
+Programming (NLP) approach, we simultaneously ascertain transformations,
+pragmas, and tile sizes, focusing on regular loop-based kernels. Our evaluation
+demonstrates that our framework adeptly identifies the appropriate
+transformations, including scenarios where no transformation is necessary, and
+inserts pragmas to achieve a favorable Quality of Results.
+
+
+
+
+
+ + ♻ ☆ Lexicographic Ranking Supermartingales with Lazy Lower Bounds + + +
+ Lexicographic Ranking SuperMartingale (LexRSM) is a probabilistic extension +of Lexicographic Ranking Function (LexRF), which is a widely accepted technique +for verifying program termination. In this paper, we are the first to propose +sound probabilistic extensions of LexRF with a weaker non-negativity condition, +called single-component (SC) non-negativity. It is known that such an +extension, if it exists, will be nontrivial due to the intricacies of the +probabilistic circumstances. + Toward the goal, we first devise the notion of fixability, which offers a +systematic approach for analyzing the soundness of possibly negative LexRSM. +This notion yields a desired extension of LexRF that is sound for general +stochastic processes. We next propose another extension, called Lazy LexRSM, +toward the application to automated verification; it is sound over +probabilistic programs with linear arithmetics, while its subclass is amenable +to automated synthesis via linear programming. We finally propose a LexRSM +synthesis algorithm for this subclass, and perform experiments. + +
+
+
+
+
+
+
+
+ + Computational Complexity 8 + +
+
+
+ + ☆ On connections between k-coloring and Euclidean k-means + + +
+ In the Euclidean $k$-means problem we are given as input a set $P$ of $n$
+points in $\mathbb{R}^d$ and the goal is to find a set of $k$ points $C\subseteq
+\mathbb{R}^d$, so as to minimize the sum of the squared Euclidean distances
+from each point in $P$ to its closest center in $C$. In this paper, we formally
+explore connections between the $k$-coloring problem on graphs and the
+Euclidean $k$-means problem. Our results are as follows:
+ $\bullet$ For all $k\ge 3$, we provide a simple reduction from the
+$k$-coloring problem on regular graphs to the Euclidean $k$-means problem.
+Moreover, our technique extends to enable a reduction from a structured max-cut
+problem (which may be considered as a partial 2-coloring problem) to the
+Euclidean $2$-means problem. Thus, we have a simple and alternate proof of the
+NP-hardness of Euclidean 2-means problem.
+ $\bullet$ In the other direction, we mimic the $O(1.7297^n)$ time algorithm
+of Williams [TCS'05] for the max-cut problem on $n$ vertices to obtain an
+algorithm for the Euclidean 2-means problem with the same runtime, improving on
+the naive exhaustive search running in $2^n\cdot \text{poly}(n,d)$ time.
+ $\bullet$ We prove similar results and connections as above for the Euclidean
+$k$-min-sum problem.
+
+
+
+
+
+ + ☆ On the Inapproximability of Finding Minimum Monitoring Edge-Geodetic + Sets + + +
+ Given an undirected connected graph $G = (V(G), E(G))$ on $n$ vertices, the +minimum Monitoring Edge-Geodetic Set (MEG-set) problem asks to find a subset $M +\subseteq V(G)$ of minimum cardinality such that, for every edge $e \in E(G)$, +there exist $x,y \in M$ for which all shortest paths between $x$ and $y$ in $G$ +traverse $e$. + We show that, for any constant $c < \frac{1}{2}$, no polynomial-time $(c \log +n)$-approximation algorithm for the minimum MEG-set problem exists, unless +$\mathsf{P} = \mathsf{NP}$. + +
+
+
+
+
+ + ☆ Dequantizability from inputs + + +
+ By comparing constructions of block encoding given by [1-4], we propose a way +to extract dequantizability from advancements in dequantization techniques that +have been led by Tang, as in [5]. Then we apply this notion to the +sparse-access input model that is known to be BQP-complete in general, thereby +conceived to be un-dequantizable. Our goal is to break down this belief by +examining the sparse-access input model's instances, particularly their input +matrices. In conclusion, this paper forms a dequantizability-verifying scheme +that can be applied whenever an input is given. + +
+
+
+
+
+ + ♻ ☆ Real Stability and Log Concavity are coNP-Hard + + +
+ Real-stable, Lorentzian, and log-concave polynomials are well-studied classes +of polynomials, and have been powerful tools in resolving several conjectures. +We show that the problems of deciding whether a polynomial of fixed degree is +real stable or log concave are coNP-hard. On the other hand, while all +homogeneous real-stable polynomials are Lorentzian and all Lorentzian +polynomials are log concave on the positive orthant, the problem of deciding +whether a polynomial of fixed degree is Lorentzian can be solved in polynomial +time. + +
+
+ comment: 21 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Bounded Depth Frege Lower Bounds for Random 3-CNFs via Deterministic + Restrictions + + +
+ A major open problem in proof complexity is to show that random 3-CNFs with +linear number of clauses require super-polynomial size refutations in bounded +depth Frege. We make a first step towards this question by showing a +super-linear lower bound: for every $k$, there exists $\epsilon > 0$ such that +any depth-$k$ Frege refutation of a random $n$-variable 3-CNF with $\Theta(n)$ +clauses has $\Omega(n^{1 + \epsilon})$ steps w.h.p. Our proof involves a novel +adaptation of the deterministic restriction technique of Chaudhuri and +Radhakrishnan (STOC'96). + +
+
+
+
+
+ + ♻ ☆ A Strongly Polynomial-Time Algorithm for Weighted General Factors with + Three Feasible Degrees + + +
+ General factors are a generalization of matchings. Given a graph $G$ with a +set $\pi(v)$ of feasible degrees, called a degree constraint, for each vertex +$v$ of $G$, the general factor problem is to find a (spanning) subgraph $F$ of +$G$ such that $\text{deg}_F(x) \in \pi(v)$ for every $v$ of $G$. When all +degree constraints are symmetric $\Delta$-matroids, the problem is solvable in +polynomial time. The weighted general factor problem is to find a general +factor of the maximum total weight in an edge-weighted graph. In this paper, we +present the first strongly polynomial-time algorithm for a type of weighted +general factor problems with real-valued edge weights that is provably not +reducible to the weighted matching problem by gadget constructions. + +
+
+ comment: This is a full version of an ISAAC 2023 paper +
+
+
+
+
+ + ♻ ☆ Logics with probabilistic team semantics and the Boolean negation + + +
+ We study the expressivity and the complexity of various logics in +probabilistic team semantics with the Boolean negation. In particular, we study +the extension of probabilistic independence logic with the Boolean negation, +and a recently introduced logic FOPT. We give a comprehensive picture of the +relative expressivity of these logics together with the most studied logics in +probabilistic team semantics setting, as well as relating their expressivity to +a numerical variant of second-order logic. In addition, we introduce novel +entropy atoms and show that the extension of first-order logic by entropy atoms +subsumes probabilistic independence logic. Finally, we obtain some results on +the complexity of model checking, validity, and satisfiability of our logics. + +
+
+
+
+
+ + ♻ ☆ Quantum Advantage from One-Way Functions + + +
+ We demonstrate quantum advantage with several basic assumptions, specifically +based on only the existence of OWFs. We introduce inefficient-verifier proofs +of quantumness (IV-PoQ), and construct it from classical bit commitments. +IV-PoQ is an interactive protocol between a verifier and a quantum prover +consisting of two phases. In the first phase, the verifier is probabilistic +polynomial-time, and it interacts with the prover. In the second phase, the +verifier becomes inefficient, and makes its decision based on the transcript of +the first phase. If the prover is honest, the inefficient verifier accepts with +high probability, but any classical malicious prover only has a small +probability of being accepted by the inefficient verifier. Our construction +demonstrates the following results: (1)If one-way functions exist, then IV-PoQ +exist. (2)If distributional collision-resistant hash functions exist (which +exist if hard-on-average problems in $\mathbf{SZK}$ exist), then constant-round +IV-PoQ exist. We also demonstrate quantum advantage based on worst-case-hard +assumptions. We define auxiliary-input IV-PoQ (AI-IV-PoQ) that only require +that for any malicious prover, there exist infinitely many auxiliary inputs +under which the prover cannot cheat. We construct AI-IV-PoQ from an +auxiliary-input version of commitments in a similar way, showing that (1)If +auxiliary-input one-way functions exist (which exist if +$\mathbf{CZK}\not\subseteq\mathbf{BPP}$), then AI-IV-PoQ exist. (2)If +auxiliary-input collision-resistant hash functions exist (which is equivalent +to $\mathbf{PWPP}\nsubseteq \mathbf{FBPP}$) or $\mathbf{SZK}\nsubseteq +\mathbf{BPP}$, then constant-round AI-IV-PoQ exist. + +
+
+ comment: 52 pages
+
+
+
+
+
+
+
+
+ + Performance Profiling 1 + +
+
+
+ + ♻ ☆ Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels + + +
+ Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific
+codes. Due to the heavy strain on the main memory interface from loading the
+sparse matrix and the possibly irregular memory access pattern, SpMV typically
+exhibits low arithmetic intensity. Repeating these products multiple times with
+the same matrix is required in many algorithms. This so-called matrix power
+kernel (MPK) provides an opportunity for data reuse since the same matrix data
+is loaded from main memory multiple times, an opportunity that has only
+recently been exploited successfully with the Recursive Algebraic Coloring
+Engine (RACE). Using RACE, one considers a graph based formulation of the SpMV
+and employs a level-based implementation of SpMV for reuse of relevant matrix
+data. However, the underlying data dependencies have restricted the use of this
+concept to shared memory parallelization and thus to single compute nodes.
+Enabling cache blocking for distributed-memory parallelization of MPK is
+challenging due to the need for explicit communication and synchronization of
+data in neighboring levels. In this work, we propose and implement a flexible
+method that interleaves the cache-blocking capabilities of RACE with an MPI
+communication scheme that fulfills all data dependencies among processes.
+Compared to a "traditional" distributed memory parallel MPK, our new
+Distributed Level-Blocked MPK yields substantial speed-ups on modern Intel and
+AMD architectures across a wide range of sparse matrices from various
+scientific applications. Finally, we address a modern quantum physics problem
+to demonstrate the applicability of our method, achieving a speed-up of up to
+4x on 832 cores of an Intel Sapphire Rapids cluster.
+
+
+
+ comment: 15 pages, 12 figures, 5 tables; added affiliation & extended + acknowledgment +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 5 + +
+
+
+ + ☆ Non-Deterministic Planning for Hyperproperty Verification + + +
+ Non-deterministic planning aims to find a policy that achieves a given +objective in an environment where actions have uncertain effects, and the agent +- potentially - only observes parts of the current state. Hyperproperties are +properties that relate multiple paths of a system and can, e.g., capture +security and information-flow policies. Popular logics for expressing temporal +hyperproperties - such as HyperLTL - extend LTL by offering selective +quantification over executions of a system. In this paper, we show that +planning offers a powerful intermediate language for the automated verification +of hyperproperties. Concretely, we present an algorithm that, given a HyperLTL +verification problem, constructs a non-deterministic multi-agent planning +instance (in the form of a QDec-POMDP) that, when admitting a plan, implies the +satisfaction of the verification problem. We show that for large fragments of +HyperLTL, the resulting planning instance corresponds to a classical, FOND, or +POND planning problem. We implement our encoding in a prototype verification +tool and report on encouraging experimental results. + +
+
+ comment: ICAPS 2024 +
+
+
+
+
+ + ♻ ☆ Equivariant ideals of polynomials + + +
+ We study existence and computability of finite bases for ideals of +polynomials over infinitely many variables. In our setting, variables come from +a countable logical structure A, and embeddings from A to A act on polynomials +by renaming variables. First, we give a sufficient and necessary condition for +A to guarantee the following generalisation of Hilbert's Basis Theorem: every +polynomial ideal which is equivariant, i.e. invariant under renaming of +variables, is finitely generated. Second, we develop an extension of classical +Buchberger's algorithm to compute a Gr\"obner basis of a given equivariant +ideal. This implies decidability of the membership problem for equivariant +ideals. Finally, we sketch upon various applications of these results to +register automata, Petri nets with data, orbit-finitely generated vector +spaces, and orbit-finite systems of linear equations. + +
+
+
+
+
+ + ♻ ☆ Masked Hard-Attention Transformers Recognize Exactly the Star-Free + Languages + + +
+ The expressive power of transformers over inputs of unbounded size can be +studied through their ability to recognize classes of formal languages. We +consider transformer encoders with hard attention (in which all attention is +focused on exactly one position) and strict future masking (in which each +position only attends to positions strictly to its left), and prove that they +are equivalent to linear temporal logic (LTL), which defines exactly the +star-free languages. A key technique is the use of Boolean RASP as a convenient +intermediate language between transformers and LTL. We then take numerous +results known for LTL and apply them to transformers, characterizing how +position embeddings, strict masking, and depth increase expressive power. + +
+
+
+
+
+ + ♻ ☆ Languages of Higher-Dimensional Timed Automata + + +
+ We present a new language semantics for real-time concurrency. Its +operational models are higher-dimensional timed automata (HDTAs), a +generalization of both higher-dimensional automata and timed automata. We +define languages of HDTAs as sets of interval-timed pomsets with interfaces. As +an application, we show that language inclusion of HDTAs is undecidable. On the +other hand, using a region construction we can show that untimings of HDTA +languages have enough regularity so that untimed language inclusion is +decidable. + +
+
+
+
+
+ + ♻ ☆ Asymptotic bounds for the number of closed and privileged words + + +
+ A word~$w$ has a border $u$ if $u$ is a non-empty proper prefix and suffix of
+$w$. A word~$w$ is said to be \emph{closed} if $w$ is of length at most $1$ or
+if $w$ has a border that occurs exactly twice in $w$. A word~$w$ is said to be
+\emph{privileged} if $w$ is of length at most $1$ or if $w$ has a privileged
+border that occurs exactly twice in $w$. Let $C_k(n)$ (resp.~$P_k(n)$) be the
+number of length-$n$ closed (resp. privileged) words over a $k$-letter
+alphabet. In this paper, we improve existing upper and lower bounds on $C_k(n)$
+and $P_k(n)$. We completely resolve the asymptotic behaviour of $C_k(n)$. We
+also nearly completely resolve the asymptotic behaviour of $P_k(n)$ by giving a
+family of upper and lower bounds that are separated by a factor that grows
+arbitrarily slowly.
+
+
+
+
+
+
+
+
+
+ + Logic in Computer Science 15 + +
+
+
+ + ☆ Towards Counting Markov Equivalence Classes with Logical Constraints + + +
+ We initiate the study of counting Markov Equivalence Classes (MEC) under +logical constraints. MECs are equivalence classes of Directed Acyclic Graphs +(DAGs) that encode the same conditional independence structure among the random +variables of a DAG model. Observational data can only allow to infer a DAG +model up to Markov Equivalence. However, Markov equivalent DAGs can represent +different causal structures, potentially super-exponentially many. Hence, +understanding MECs combinatorially is critical to understanding the complexity +of causal inference. In this paper, we focus on analysing MECs of size one, +with logical constraints on the graph topology. We provide a polynomial-time +algorithm (w.r.t. the number of nodes) for enumerating essential DAGs (the only +members of an MEC of size one) with arbitrary logical constraints expressed in +first-order logic with two variables and counting quantifiers (C^2). Our work +brings together recent developments in tractable first-order model counting and +combinatorics of MECs. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Traffic Scenario Logic: A Spatial-Temporal Logic for Modeling and + Reasoning of Urban Traffic Scenarios + + +
+ Formal representations of traffic scenarios can be used to generate test +cases for the safety verification of autonomous driving. However, most existing +methods are limited in highway or highly simplified intersection scenarios due +to the intricacy and diversity of traffic scenarios. In response, we propose +Traffic Scenario Logic (TSL), which is a spatial-temporal logic designed for +modeling and reasoning of urban pedestrian-free traffic scenarios. TSL provides +a formal representation of the urban road network that can be derived from +OpenDRIVE, i.e., the de facto industry standard of high-definition maps for +autonomous driving, enabling the representation of a broad range of traffic +scenarios. We implemented the reasoning of TSL using Telingo, i.e., a solver +for temporal programs based on the Answer Set Programming, and tested it on +different urban road layouts. Demonstrations show the effectiveness of TSL in +test scenario generation and its potential value in areas like decision-making +and control verification of autonomous driving. + +
+
+ comment: Submitted to KR 2024 +
+
+
+
+
+ + ☆ Safe and Personalizable Logical Guidance for Trajectory Planning of + Autonomous Driving SC 2024 + + +
+ Autonomous vehicles necessitate a delicate balance between safety, +efficiency, and user preferences in trajectory planning. Existing traditional +or learning-based methods face challenges in adequately addressing all these +aspects. In response, this paper proposes a novel component termed the Logical +Guidance Layer (LGL), designed for seamless integration into autonomous driving +trajectory planning frameworks, specifically tailored for highway scenarios. +The LGL guides the trajectory planning with a local target area determined +through scenario reasoning, scenario evaluation, and guidance area calculation. +Integrating the Responsibility-Sensitive Safety (RSS) model, the LGL ensures +formal safety guarantees while accommodating various user preferences defined +by logical formulae. Experimental validation demonstrates the effectiveness of +the LGL in achieving a balance between safety and efficiency, and meeting user +preferences in autonomous highway driving scenarios. + +
+
+ comment: Submitted to ITSC 2024 +
+
+
+
+
+ + ☆ The complexity of deciding characteristic formulae in van Glabbeek's + branching-time spectrum + + +
+ Characteristic formulae give a complete logical description of the behaviour +of processes modulo some chosen notion of behavioural semantics. They allow one +to reduce equivalence or preorder checking to model checking, and are exactly +the formulae in the modal logics characterizing classic behavioural +equivalences and preorders for which model checking can be reduced to +equivalence or preorder checking. + This paper studies the complexity of determining whether a formula is +characteristic for some finite, loop-free process in each of the logics +providing modal characterizations of the simulation-based semantics in van +Glabbeek's branching-time spectrum. Since characteristic formulae in each of +those logics are exactly the consistent and prime ones, it presents complexity +results for the satisfiability and primality problems, and investigates the +boundary between modal logics for which those problems can be solved in +polynomial time and those for which they become computationally hard. + Amongst other contributions, this article also studies the complexity of +constructing characteristic formulae in the modal logics characterizing +simulation-based semantics, both when such formulae are presented in explicit +form and via systems of equations. + +
+
+ comment: 64 pages, 1 figure +
+
+
+
+
+ + ☆ Tools at the Frontiers of Quantitative Verification + + +
+ The analysis of formal models that include quantitative aspects such as +timing or probabilistic choices is performed by quantitative verification +tools. Broad and mature tool support is available for computing basic +properties such as expected rewards on basic models such as Markov chains. +Previous editions of QComp, the comparison of tools for the analysis of +quantitative formal models, focused on this setting. Many application +scenarios, however, require more advanced property types such as LTL and +parameter synthesis queries as well as advanced models like stochastic games +and partially observable MDPs. For these, tool support is in its infancy today. +This paper presents the outcomes of QComp 2023: a survey of the state of the +art in quantitative verification tool support for advanced property types and +models. With tools ranging from first research prototypes to well-supported +integrations into established toolsets, this report highlights today's active +areas and tomorrow's challenges in tool-focused research for quantitative +verification. + +
+
+
+
+
+ + ☆ Non-Deterministic Planning for Hyperproperty Verification + + +
+ Non-deterministic planning aims to find a policy that achieves a given +objective in an environment where actions have uncertain effects, and the agent +- potentially - only observes parts of the current state. Hyperproperties are +properties that relate multiple paths of a system and can, e.g., capture +security and information-flow policies. Popular logics for expressing temporal +hyperproperties - such as HyperLTL - extend LTL by offering selective +quantification over executions of a system. In this paper, we show that +planning offers a powerful intermediate language for the automated verification +of hyperproperties. Concretely, we present an algorithm that, given a HyperLTL +verification problem, constructs a non-deterministic multi-agent planning +instance (in the form of a QDec-POMDP) that, when admitting a plan, implies the +satisfaction of the verification problem. We show that for large fragments of +HyperLTL, the resulting planning instance corresponds to a classical, FOND, or +POND planning problem. We implement our encoding in a prototype verification +tool and report on encouraging experimental results. + +
+
+ comment: ICAPS 2024 +
+
+
+
+
+ + ☆ Analogical proportions II + + +
+ Analogical reasoning is the ability to detect parallels between two seemingly +distant objects or situations, a fundamental human capacity used for example in +commonsense reasoning, learning, and creativity which is believed by many +researchers to be at the core of human and artificial general intelligence. +Analogical proportions are expressions of the form ``$a$ is to $b$ what $c$ is +to $d$'' at the core of analogical reasoning. The author has recently +introduced an abstract algebraic framework of analogical proportions within the +general setting of universal algebra. It is the purpose of this paper to +further develop the mathematical theory of analogical proportions within that +framework as motivated by the fact that it has already been successfully +applied to logic program synthesis in artificial intelligence. + +
+
+
+
+
+ + ☆ A Coherence Construction for the Propositional Universe + + +
+ We record a particularly simple construction on top of Lumsdaine's local +universes that allows for a Coquand-style universe of propositions with +propositional extensionality to be interpreted in a category with subobject +classifiers. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Source-level reasoning for quantitative information flow + + +
+ We present a novel formal system for proving quantitative-leakage properties
+of programs. Based on a theory of Quantitative Information Flow (QIF) that
+models information leakage as a noisy communication channel, it uses
+"gain-functions" for the description and measurement of expected leaks.
+ We use a small imperative programming language, augmented with leakage
+features, and with it express adversaries' activities in the style of, but more
+generally than, the Hoare triples or expectation transformers that
+traditionally express deterministic or probabilistic correctness but without
+information flow.
+ The programs are annotated with "gain-expressions" that capture simple
+adversarial settings such as "Guess the secret in one try." but also much more
+general ones; and our formal syntax and logic-based framework enables us to
+transform such gain-expressions that apply after a program has finished to ones
+that equivalently apply before the program has begun.
+ In that way we enable a formal proof-based reasoning system for QIF at the
+source level. We apply it to the programming language we have chosen, and
+demonstrate its effectiveness in a number of small but sometimes intricate
+situations.
+
+
+
+
+
+
+ + ☆ Verifying Lock-free Search Structure Templates + + +
+ We present and verify template algorithms for lock-free concurrent search +structures that cover a broad range of existing implementations based on lists +and skiplists. Our linearizability proofs are fully mechanized in the +concurrent separation logic Iris. The proofs are modular and cover the broader +design space of the underlying algorithms by parameterizing the verification +over aspects such as the low-level representation of nodes and the style of +data structure maintenance. As a further technical contribution, we present a +mechanization of a recently proposed method for reasoning about +future-dependent linearization points using hindsight arguments. The +mechanization builds on Iris' support for prophecy reasoning and user-defined +ghost resources. We demonstrate that the method can help to reduce the proof +effort compared to direct prophecy-based proofs. + +
+
+ comment: Extended version of an article to appear in ECOOP'24 +
+
+
+
+
+ + ♻ ☆ Masked Hard-Attention Transformers Recognize Exactly the Star-Free + Languages + + +
+ The expressive power of transformers over inputs of unbounded size can be +studied through their ability to recognize classes of formal languages. We +consider transformer encoders with hard attention (in which all attention is +focused on exactly one position) and strict future masking (in which each +position only attends to positions strictly to its left), and prove that they +are equivalent to linear temporal logic (LTL), which defines exactly the +star-free languages. A key technique is the use of Boolean RASP as a convenient +intermediate language between transformers and LTL. We then take numerous +results known for LTL and apply them to transformers, characterizing how +position embeddings, strict masking, and depth increase expressive power. + +
+
+
+
+
+ + ♻ ☆ Equivariant ideals of polynomials + + +
+ We study existence and computability of finite bases for ideals of +polynomials over infinitely many variables. In our setting, variables come from +a countable logical structure A, and embeddings from A to A act on polynomials +by renaming variables. First, we give a sufficient and necessary condition for +A to guarantee the following generalisation of Hilbert's Basis Theorem: every +polynomial ideal which is equivariant, i.e. invariant under renaming of +variables, is finitely generated. Second, we develop an extension of classical +Buchberger's algorithm to compute a Gr\"obner basis of a given equivariant +ideal. This implies decidability of the membership problem for equivariant +ideals. Finally, we sketch upon various applications of these results to +register automata, Petri nets with data, orbit-finitely generated vector +spaces, and orbit-finite systems of linear equations. + +
+
+
+
+
+ + ♻ ☆ Logics with probabilistic team semantics and the Boolean negation + + +
+ We study the expressivity and the complexity of various logics in +probabilistic team semantics with the Boolean negation. In particular, we study +the extension of probabilistic independence logic with the Boolean negation, +and a recently introduced logic FOPT. We give a comprehensive picture of the +relative expressivity of these logics together with the most studied logics in +probabilistic team semantics setting, as well as relating their expressivity to +a numerical variant of second-order logic. In addition, we introduce novel +entropy atoms and show that the extension of first-order logic by entropy atoms +subsumes probabilistic independence logic. Finally, we obtain some results on +the complexity of model checking, validity, and satisfiability of our logics. + +
+
+
+
+
+ + ♻ ☆ Languages of Higher-Dimensional Timed Automata + + +
+ We present a new language semantics for real-time concurrency. Its +operational models are higher-dimensional timed automata (HDTAs), a +generalization of both higher-dimensional automata and timed automata. We +define languages of HDTAs as sets of interval-timed pomsets with interfaces. As +an application, we show that language inclusion of HDTAs is undecidable. On the +other hand, using a region construction we can show that untimings of HDTA +languages have enough regularity so that untimed language inclusion is +decidable. + +
+
+
+
+
+ + ♻ ☆ Composing Codensity Bisimulations + + +
+ Proving compositionality of behavioral equivalence on state-based systems +with respect to algebraic operations is a classical and widely studied problem. +We study a categorical formulation of this problem, where operations on +state-based systems modeled as coalgebras can be elegantly captured through +distributive laws between functors. To prove compositionality, it then suffices +to show that this distributive law lifts from sets to relations, giving an +explanation of how behavioral equivalence on smaller systems can be combined to +obtain behavioral equivalence on the composed system. + In this paper, we refine this approach by focusing on so-called codensity +lifting of functors, which gives a very generic presentation of various notions +of (bi)similarity as well as quantitative notions such as behavioral metrics on +probabilistic systems. The key idea is to use codensity liftings both at the +level of algebras and coalgebras, using a new generalization of the codensity +lifting. The problem of lifting distributive laws then reduces to the abstract +problem of constructing distributive laws between codensity liftings, for which +we propose a simplified sufficient condition. Our sufficient condition +instantiates to concrete proof methods for compositionality of algebraic +operations on various types of state-based systems. We instantiate our results +to prove compositionality of qualitative and quantitative properties of +deterministic automata. We also explore the limits of our approach by including +an example of probabilistic systems, where it is unclear whether the sufficient +condition holds, and instead we use our setting to give a direct proof of +compositionality. ... + +
+
+ comment: Extended version (includes the Appendix) of the paper accepted at + LiCS-24 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 18 + +
+
+
+ + ☆ Decentralized Federated Learning Over Imperfect Communication Channels + + +
+ This paper analyzes the impact of imperfect communication channels on +decentralized federated learning (D-FL) and subsequently determines the optimal +number of local aggregations per training round, adapting to the network +topology and imperfect channels. We start by deriving the bias of locally +aggregated D-FL models under imperfect channels from the ideal global models +requiring perfect channels and aggregations. The bias reveals that excessive +local aggregations can accumulate communication errors and degrade convergence. +Another important aspect is that we analyze a convergence upper bound of D-FL +based on the bias. By minimizing the bound, the optimal number of local +aggregations is identified to balance a trade-off with accumulation of +communication errors in the absence of knowledge of the channels. With this +knowledge, the impact of communication errors can be alleviated, allowing the +convergence upper bound to decrease throughout aggregations. Experiments +validate our convergence analysis and also identify the optimal number of local +aggregations on two widely considered image classification tasks. It is seen +that D-FL, with an optimal number of local aggregations, can outperform its +potential alternatives by over 10% in training accuracy. + +
+
+
+
+
+ + ☆ Application Layer Cyber Deception without Developer Interaction + + +
+ Cyber deception techniques that are tightly intertwined with applications +pose significant technical challenges in production systems. Security measures +are usually the responsibility of a system operator, but they are typically +limited to accessing built software artifacts, not their source code. This +limitation makes it particularly challenging to deploy cyber deception +techniques at application runtime and without full control over the software +development lifecycle. This work reviews 19 technical methods to accomplish +this and evaluates them based on technical, topological, operational, and +efficacy properties. We find some novel techniques beyond honeypots and reverse +proxies that seem to have received little research interest despite their +promise for cyber deception. We believe that overcoming these technical +challenges can drive the adoption of more dynamic and personalized cyber +deception techniques, tailored to specific classes of applications. + +
+
+ comment: to be published in the 3rd Workshop on Active Defense and Deception + (ADnD 2024) +
+
+
+
+
+ + ☆ Sorting in One and Two Rounds using $t$-Comparators + + +
+ We examine sorting algorithms for $n$ elements whose basic operation is +comparing $t$ elements simultaneously (a $t$-comparator). We focus on +algorithms that use only a single round or two rounds -- comparisons performed +in the second round depend on the outcomes of the first round comparators. + We design deterministic and randomized algorithms. In the deterministic case, +we show an interesting relation to design theory (namely, to 2-Steiner +systems), which yields a single-round optimal algorithm for $n=t^{2^k}$ with +any $k\ge 1$ and a variety of possible values of $t$. For some values of $t$, +however, no algorithm can reach the optimal (information-theoretic) bound on +the number of comparators. For this case (and any other $n$ and $t$), we show +an algorithm that uses at most three times as many comparators as the +theoretical bound. + We also design a randomized Las-Vegas two-rounds sorting algorithm for any +$n$ and $t$. Our algorithm uses an asymptotically optimal number of +$O(\max(\frac{n^{3/2}}{t^2},\frac{n}{t}))$ comparators, with high probability, +i.e., with probability at least $1-1/n$. The analysis of this algorithm +involves the gradual unveiling of randomness, using a novel technique which we +coin the binary tree of deferred randomness. + +
+
+
+
+
+ + ☆ TempoScale: A Cloud Workloads Prediction Approach Integrating Short-Term + and Long-Term Information + + +
+ Cloud native solutions are widely applied in various fields, placing higher +demands on the efficient management and utilization of resource platforms. To +achieve the efficiency, load forecasting and elastic scaling have become +crucial technologies for dynamically adjusting cloud resources to meet user +demands and minimizing resource waste. However, existing prediction-based +methods lack comprehensive analysis and integration of load characteristics +across different time scales. For instance, long-term trend analysis helps +reveal long-term changes in load and resource demand, thereby supporting +proactive resource allocation over longer periods, while short-term volatility +analysis can examine short-term fluctuations in load and resource demand, +providing support for real-time scheduling and rapid response. In response to +this, our research introduces TempoScale, which aims to enhance the +comprehensive understanding of temporal variations in cloud workloads, enabling +more intelligent and adaptive decision-making for elastic scaling. TempoScale +utilizes the Complete Ensemble Empirical Mode Decomposition with Adaptive Noise +algorithm to decompose time-series load data into multiple Intrinsic Mode +Functions (IMF) and a Residual Component (RC). First, we integrate the IMF, +which represents both long-term trends and short-term fluctuations, into the +time series prediction model to obtain intermediate results. Then, these +intermediate results, along with the RC, are transferred into a fully connected +layer to obtain the final result. Finally, this result is fed into the resource +management system based on Kubernetes for resource scaling. Our proposed +approach can reduce the Mean Square Error by 5.80% to 30.43% compared to the +baselines, and reduce the average response time by 5.58% to 31.15%. + +
+
+ comment: 11 pages, 11 figures, 4 tables
+
+
+
+
+
+ + ☆ Maverick-Aware Shapley Valuation for Client Selection in Federated + Learning + + +
+ Federated Learning (FL) allows clients to train a model collaboratively +without sharing their private data. One key challenge in practical FL systems +is data heterogeneity, particularly in handling clients with rare data, also +referred to as Mavericks. These clients own one or more data classes +exclusively, and the model performance becomes poor without their +participation. Thus, utilizing Mavericks throughout training is crucial. In +this paper, we first design a Maverick-aware Shapley valuation that fairly +evaluates the contribution of Mavericks. The main idea is to compute the +clients' Shapley values (SV) class-wise, i.e., per label. Next, we propose +FedMS, a Maverick-Shapley client selection mechanism for FL that intelligently +selects the clients that contribute the most in each round, by employing our +Maverick-aware SV-based contribution score. We show that, compared to an +extensive list of baselines, FedMS achieves better model performance and fairer +Shapley Rewards distribution. + +
+
+
+
+
+ + ☆ Carbon-aware Software Services + + +
+ The significant carbon footprint of the ICT sector calls for methodologies to +contain carbon emissions of running software. This article proposes a novel +framework for implementing, configuring and assessing carbon-aware interactive +software services. First, we propose a methodology to implement carbon-aware +services leveraging the Strategy design pattern to feature alternative service +versions with different energy consumption. Then, we devise a bilevel +optimisation scheme to configure which version to use at different times of the +day, based on forecasts of carbon intensity and service requests, pursuing the +two-fold goal of minimising carbon emissions and maintaining average output +quality above a desired set-point. Last, an open-source prototype of such +optimisation scheme is used to configure a software service implemented as per +our methodology and assessed against traditional non-adaptive implementations +of the same service. Results show the capability of our framework to control +the average quality of output results of carbon-aware services and to reduce +carbon emissions from 8% to 50%. + +
+
+
+
+
+ + ☆ Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels + + +
+ Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific
+codes. Due to the heavy strain on the main memory interface from loading the
+sparse matrix and the possibly irregular memory access pattern, SpMV typically
+exhibits low arithmetic intensity. Repeating these products multiple times with
+the same matrix is required in many algorithms. This so-called matrix power
+kernel (MPK) provides an opportunity for data reuse since the same matrix data
+is loaded from main memory multiple times, an opportunity that has only
+recently been exploited successfully with the Recursive Algebraic Coloring
+Engine (RACE). Using RACE, one considers a graph based formulation of the SpMV
+and employs a level-based implementation of SpMV for reuse of relevant matrix
+data. However, the underlying data dependencies have restricted the use of this
+concept to shared memory parallelization and thus to single compute nodes.
+Enabling cache blocking for distributed-memory parallelization of MPK is
+challenging due to the need for explicit communication and synchronization of
+data in neighboring levels. In this work, we propose and implement a flexible
+method that interleaves the cache-blocking capabilities of RACE with an MPI
+communication scheme that fulfills all data dependencies among processes.
+Compared to a "traditional" distributed memory parallel MPK, our new
+Distributed Level-Blocked MPK yields substantial speed-ups on modern Intel and
+AMD architectures across a wide range of sparse matrices from various
+scientific applications. Finally, we address a modern quantum physics problem
+to demonstrate the applicability of our method, achieving a speed-up of up to
+4x on 832 cores of an Intel Sapphire Rapids cluster.
+
+
+
+ comment: 14 pages, 12 figures, 5 tables +
+
+
+
+
+ + ☆ MOSS: A Large-scale Open Microscopic Traffic Simulation System SC 2024 + + +
+ In the research of Intelligent Transportation Systems (ITS), traffic +simulation is a key procedure for the evaluation of new methods and +optimization of strategies. However, existing traffic simulation systems face +two challenges. First, how to balance simulation scale with realism is a +dilemma. Second, it is hard to simulate realistic results, which requires +realistic travel demand data and simulator. These problems limit computer-aided +optimization of traffic management strategies for large-scale road networks and +reduce the usability of traffic simulations in areas where real-world travel +demand data are lacking. To address these problems, we design and implement +MObility Simulation System (MOSS). MOSS adopts GPU acceleration to +significantly improve the efficiency and scale of microscopic traffic +simulation, which enables realistic and fast simulations for large-scale road +networks. It provides realistic travel Origin-Destination (OD) matrices +generation through a pre-trained generative neural network model based on +publicly available data on a global scale, such as satellite imagery, to help +researchers build meaningful travel demand data. It also provides a complete +open toolchain to help users with road network construction, demand generation, +simulation, and result analysis. The whole toolchain including the simulator +can be accessed at https://moss.fiblab.net and the codes are open-source for +community collaboration. + +
+
+ comment: Submitted to IEEE ITSC 2024 +
+
+
+
+
+ + ☆ Data Sharing at the Edge of the Network: A Disturbance Resilient + Multi-modal ITS + + +
+ Mobility-as-a-Service (MaaS) is a paradigm that encourages the shift from +private cars to more sustainable alternative mobility services. MaaS provides +services that enhance and enable multiple modes of transport to operate +seamlessly and bring Multimodal Intelligent Transport Systems (M-ITS) closer +to reality. This requires sharing and integration of data collected from +multiple sources including modes of transports, sensors, and end-users' devices +to allow seamless and integrated services especially during unprecedented +disturbances. This paper discusses the interactions among transportation modes, +networks, potential disturbance scenarios, and adaptation strategies to +mitigate their impact on MaaS. We particularly discuss the need to share data +between the modes of transport and relevant entities that are at the vicinity +of each other, taking advantage of edge computing technology to avoid any +latency due to communication to the cloud and privacy concerns. However, when +sharing at the edge, bandwidth, storage, and computational limitations must be +considered. + +
+
+
+
+
+ + ☆ FedASTA: Federated adaptive spatial-temporal attention for traffic flow + prediction + + +
+ Mobile devices and the Internet of Things (IoT) devices nowadays generate a +large amount of heterogeneous spatial-temporal data. It remains a challenging +problem to model the spatial-temporal dynamics under privacy concern. Federated +learning (FL) has been proposed as a framework to enable model training across +distributed devices without sharing original data which reduce privacy concern. +Personalized federated learning (PFL) methods further address data heterogeneity +problem. However, these methods don't consider natural spatial relations among +nodes. For the sake of modeling spatial relations, Graph Neural Network (GNN) +based FL approach have been proposed. But dynamic spatial-temporal relations +among edge nodes are not taken into account. Several approaches model +spatial-temporal dynamics in a centralized environment, while less effort has +been made under federated setting. To overcome these challenges, we propose a +novel Federated Adaptive Spatial-Temporal Attention (FedASTA) framework to +model the dynamic spatial-temporal relations. On the client node, FedASTA +extracts temporal relations and trend patterns from the decomposed terms of +original time series. Then, on the server node, FedASTA utilizes trend patterns +from clients to construct adaptive temporal-spatial aware graph which captures +dynamic correlation between clients. Besides, we design a masked spatial +attention module with both static graph and constructed adaptive graph to model +spatial dependencies among clients. Extensive experiments on five real-world +public traffic flow datasets demonstrate that our method achieves state-of-art +performance in federated scenario. In addition, the experiments made in +centralized setting show the effectiveness of our novel adaptive graph +construction approach compared with other popular dynamic spatial-temporal +aware methods. + +
+
+
+
+
+ + ♻ ☆ Vertical Federated Learning Hybrid Local Pre-training + + +
+ Vertical Federated Learning (VFL), which has a broad range of real-world +applications, has received much attention in both academia and industry. +Enterprises aspire to exploit more valuable features of the same users from +diverse departments to boost their model prediction skills. VFL addresses this +demand and concurrently secures individual parties from exposing their raw +data. However, conventional VFL encounters a bottleneck as it only leverages +aligned samples, whose size shrinks with more parties involved, resulting in +data scarcity and the waste of unaligned data. To address this problem, we +propose a novel VFL Hybrid Local Pre-training (VFLHLP) approach. VFLHLP first +pre-trains local networks on the local data of participating parties. Then it +utilizes these pre-trained networks to adjust the sub-model for the labeled +party or enhance representation learning for other parties during downstream +federated learning on aligned data, boosting the performance of federated +models. The experimental results on real-world advertising datasets, +demonstrate that our approach achieves the best performance over baseline +methods by large margins. The ablation study further illustrates the +contribution of each technique in VFLHLP to its overall performance. + +
+
+
+
+
+ + ♻ ☆ A GAN-Based Data Poisoning Attack Against Federated Learning Systems and + Its Countermeasure + + +
+ As a distributed machine learning paradigm, federated learning (FL) is +collaboratively carried out on privately owned datasets but without direct data +access. Although the original intention is to allay data privacy concerns, +"available but not visible" data in FL potentially brings new security threats, +particularly poisoning attacks that target such "not visible" local data. +Initial attempts have been made to conduct data poisoning attacks against FL +systems, but cannot be fully successful due to their high chance of causing +statistical anomalies. To unleash the potential for truly "invisible" attacks +and build a more deterrent threat model, in this paper, a new data poisoning +attack model named VagueGAN is proposed, which can generate seemingly +legitimate but noisy poisoned data by untraditionally taking advantage of +generative adversarial network (GAN) variants. Capable of manipulating the +quality of poisoned data on demand, VagueGAN enables to trade-off attack +effectiveness and stealthiness. Furthermore, a cost-effective countermeasure +named Model Consistency-Based Defense (MCD) is proposed to identify +GAN-poisoned data or models after finding out the consistency of GAN outputs. +Extensive experiments on multiple datasets indicate that our attack method is +generally much more stealthy as well as more effective in degrading FL +performance with low complexity. Our defense method is also shown to be more +competent in identifying GAN-poisoned data or models. The source codes are +publicly available at +\href{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}. + +
+
+ comment: 18 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Near Optimal Bounds for Replacement Paths and Related Problems in the + CONGEST Model + + +
+ We present several results in the CONGEST model on round complexity for +Replacement Paths (RPaths), Minimum Weight Cycle (MWC), and All Nodes Shortest +Cycles (ANSC). We study these fundamental problems in both directed and +undirected graphs, both weighted and unweighted. Many of our results are +optimal to within a polylog factor: For an $n$-node graph $G$ we establish near +linear lower and upper bounds for computing RPaths if $G$ is directed and +weighted, and for computing MWC and ANSC if $G$ is weighted, directed or +undirected; near $\sqrt{n}$ lower and upper bounds for undirected weighted +RPaths; and $\Theta(D)$ bound for undirected unweighted RPaths. We also present +lower and upper bounds for approximation versions of these problems, notably a +$(2-(1/g))$-approximation algorithm for undirected unweighted MWC that runs in +$\tilde{O}(\sqrt{n}+D)$ rounds, improving on the previous best bound of +$\tilde{O}(\sqrt{ng}+D)$ rounds, where $g$ is the MWC length. We present a +$(1+\epsilon)$-approximation algorithm for directed weighted RPaths, which +beats the linear lower bound for exact RPaths. + +
+
+
+
+
+ + ♻ ☆ AdaptSFL: Adaptive Split Federated Learning in Resource-constrained Edge + Networks + + +
+ The increasing complexity of deep neural networks poses significant barriers +to democratizing them to resource-limited edge devices. To address this +challenge, split federated learning (SFL) has emerged as a promising solution +by offloading the primary training workload to a server via model partitioning +while enabling parallel training among edge devices. However, although system +optimization substantially influences the performance of SFL under +resource-constrained systems, the problem remains largely uncharted. In this +paper, we provide a convergence analysis of SFL which quantifies the impact of +model splitting (MS) and client-side model aggregation (MA) on the learning +performance, serving as a theoretical foundation. Then, we propose AdaptSFL, a +novel resource-adaptive SFL framework, to expedite SFL under +resource-constrained edge computing systems. Specifically, AdaptSFL adaptively +controls client-side MA and MS to balance communication-computing latency and +training convergence. Extensive simulations across various datasets validate +that our proposed AdaptSFL framework takes considerably less time to achieve a +target accuracy than benchmarks, demonstrating the effectiveness of the +proposed strategies. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Taking GPU Programming Models to Task for Performance Portability + + +
+ Portability is critical to ensuring high productivity in developing and +maintaining scientific software as the diversity in on-node hardware +architectures increases. While several programming models provide portability +for diverse GPU platforms, they don't make any guarantees about performance +portability. In this work, we explore several programming models -- CUDA, HIP, +Kokkos, RAJA, OpenMP, OpenACC, and SYCL, to study if the performance of these +models is consistently good across NVIDIA and AMD GPUs. We use five proxy +applications from different scientific domains, create implementations where +missing, and use them to present a comprehensive comparative evaluation of the +programming models. We provide a Spack scripting-based methodology to ensure +reproducibility of experiments conducted in this work. Finally, we attempt to +answer the question -- to what extent does each programming model provide +performance portability for heterogeneous systems in real-world usage? + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Approximate Byzantine Fault-Tolerance in Distributed Optimization + + +
+ This paper considers the problem of Byzantine fault-tolerance in distributed +multi-agent optimization. In this problem, each agent has a local cost +function, and in the fault-free case, the goal is to design a distributed +algorithm that allows all the agents to find a minimum point of all the agents' +aggregate cost function. We consider a scenario where some agents might be +Byzantine faulty that renders the original goal of computing a minimum point of +all the agents' aggregate cost vacuous. A more reasonable objective for an +algorithm in this scenario is to allow all the non-faulty agents to compute the +minimum point of only the non-faulty agents' aggregate cost. Prior work shows +that if there are up to $f$ (out of $n$) Byzantine agents then a minimum point +of the non-faulty agents' aggregate cost can be computed exactly if and only if +the non-faulty agents' costs satisfy a certain redundancy property called +$2f$-redundancy. However, $2f$-redundancy is an ideal property that can be +satisfied only in systems free from noise or uncertainties, which can make the +goal of exact fault-tolerance unachievable in some applications. Thus, we +introduce the notion of $(f,\epsilon)$-resilience, a generalization of exact +fault-tolerance wherein the objective is to find an approximate minimum point +of the non-faulty aggregate cost, with $\epsilon$ accuracy. This approximate +fault-tolerance can be achieved under a weaker condition that is easier to +satisfy in practice, compared to $2f$-redundancy. We obtain necessary and +sufficient conditions for achieving $(f,\epsilon)$-resilience characterizing +the correlation between relaxation in redundancy and approximation in +resilience. In case when the agents' cost functions are differentiable, we +obtain conditions for $(f,\epsilon)$-resilience of the distributed +gradient-descent method when equipped with robust gradient aggregation. + +
+
+ comment: 43 pages, 5 figures, and 1 table. The report is an important + extension to prior work https://dl.acm.org/doi/abs/10.1145/3382734.3405748, + and arXiv:2003.09675; Added an alternative result with a better analysis +
+
+
+
+
+ + ♻ ☆ Parallel and (Nearly) Work-Efficient Dynamic Programming + + +
+ The idea of dynamic programming (DP), proposed by Bellman in the 1950s, is +one of the most important algorithmic techniques. However, in parallel, many +fundamental and sequentially simple problems become more challenging, and open +to a (nearly) work-efficient solution (i.e., the work is off by at most a +polylogarithmic factor over the best sequential solution). In fact, sequential +DP algorithms employ many advanced optimizations such as decision monotonicity +or special data structures, and achieve better work than straightforward +solutions. Many such optimizations are inherently sequential, which creates +extra challenges for a parallel algorithm to achieve the same work bound. + The goal of this paper is to achieve (nearly) work-efficient parallel DP +algorithms by parallelizing classic, highly-optimized and practical sequential +algorithms. We show a general framework called the Cordon Algorithm for +parallel DP algorithms, and use it to solve several classic problems. Our +selection of problems includes Longest Increasing Subsequence (LIS), sparse +Longest Common Subsequence (LCS), convex/concave generalized Least Weight +Subsequence (LWS), Optimal Alphabetic Tree (OAT), and more. We show how the +Cordon Algorithm can be used to achieve the same level of optimization as the +sequential algorithms, and achieve good parallelism. Many of our algorithms are +conceptually simple, and we show some experimental results as +proofs-of-concept. + +
+
+
+
+
+ + ♻ ☆ Exploring the Design Space for Message-Driven Systems for Dynamic Graph + Processing using CCA + + +
+ Computer systems that have been successfully deployed for dense regular +workloads fall short of achieving scalability and efficiency when applied to +irregular and dynamic graph applications. Conventional computing systems rely +heavily on static, regular, numeric intensive computations while High +Performance Computing systems executing parallel graph applications exhibit +little locality, spatial or temporal, and are fine-grained and memory +intensive. With the strong interest in AI which depend on these very different +use cases combined with the end of Moore's Law at nanoscale, dramatic +alternatives in architecture and underlying execution models are required. This +paper identifies an innovative non-von Neumann architecture, Continuum Computer +Architecture (CCA), that redefines the nature of computing structures to yield +powerful innovations in computational methods to deliver a new generation of +highly parallel hardware architecture. CCA reflects a genus of highly parallel +architectures that while varying in specific quantities (e.g., memory blocks), +share a multiple of attributes not found in typical von Neumann machines. Among +these are memory-centric components, message-driven asynchronous flow control, +and lightweight out-of-order execution across a global name space. Together +these innovative non-von Neumann architectural properties guided by a new +original execution model will deliver the new future path for extending beyond +the von Neumann model. This paper documents a series of interrelated +experiments that together establish future directions for next generation +non-von Neumann architectures, especially for graph processing. + +
+
+
+
+
+
+
+
+ + Performance Profiling 3 + +
+
+
+ + ☆ Cache Blocking of Distributed-Memory Parallel Matrix Power Kernels + + +
+ Sparse matrix-vector products (SpMVs) are a bottleneck in many scientific +codes. Due to the heavy strain on the main memory interface from loading the +sparse matrix and the possibly irregular memory access pattern, SpMV typically +exhibits low arithmetic intensity. Repeating these products multiple times with +the same matrix is required in many algorithms. This so-called matrix power +kernel (MPK) provides an opportunity for data reuse since the same matrix data +is loaded from main memory multiple times, an opportunity that has only +recently been exploited successfully with the Recursive Algebraic Coloring +Engine (RACE). Using RACE, one considers a graph based formulation of the SpMV +and employs a level-based implementation of SpMV for reuse of relevant matrix +data. However, the underlying data dependencies have restricted the use of this +concept to shared memory parallelization and thus to single compute nodes. +Enabling cache blocking for distributed-memory parallelization of MPK is +challenging due to the need for explicit communication and synchronization of +data in neighboring levels. In this work, we propose and implement a flexible +method that interleaves the cache-blocking capabilities of RACE with an MPI +communication scheme that fulfills all data dependencies among processes. +Compared to a "traditional" distributed memory parallel MPK, our new +Distributed Level-Blocked MPK yields substantial speed-ups on modern Intel and +AMD architectures across a wide range of sparse matrices from various +scientific applications. Finally, we address a modern quantum physics problem +to demonstrate the applicability of our method, achieving a speed-up of up to +4x on 832 cores of an Intel Sapphire Rapids cluster. + +
+
+ comment: 14 pages, 12 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Count-Min Sketch with Conservative Updates: Worst-Case Analysis + + +
+ Count-Min Sketch with Conservative Updates (CMS-CU) is a memory-efficient +hash-based data structure used to estimate the occurrences of items within a +data stream. CMS-CU stores $m$ counters and employs $d$ hash functions to map +items to these counters. We first argue that the estimation error in CMS-CU is +maximal when each item appears at most once in the stream. Next, we study +CMS-CU in this setting. In the case where $d=m-1$, we prove that the average +estimation error and the average counter rate converge almost surely to +$\frac{1}{2}$, contrasting with the vanilla Count-Min Sketch, where the average +counter rate is equal to $\frac{m-1}{m}$. For any given $m$ and $d$, we prove +novel lower and upper bounds on the average estimation error, incorporating a +positive integer parameter $g$. Larger values of this parameter improve the +accuracy of the bounds. Moreover, the computation of each bound involves +examining an ergodic Markov process with a state space of size +$\binom{m+g-d}{g}$ and a sparse transition probabilities matrix containing +$\mathcal{O}(m\binom{m+g-d}{g})$ non-zero entries. For $d=m-1$, $g=1$, and as +$m\to \infty$, we show that the lower and upper bounds coincide. In general, +our bounds exhibit high accuracy for small values of $g$, as shown by numerical +computation. For example, for $m=50$, $d=4$, and $g=5$, the difference between +the lower and upper bounds is smaller than $10^{-4}$. + +
+
+
+
+
+ + ♻ ☆ Taking GPU Programming Models to Task for Performance Portability + + +
+ Portability is critical to ensuring high productivity in developing and +maintaining scientific software as the diversity in on-node hardware +architectures increases. While several programming models provide portability +for diverse GPU platforms, they don't make any guarantees about performance +portability. In this work, we explore several programming models -- CUDA, HIP, +Kokkos, RAJA, OpenMP, OpenACC, and SYCL, to study if the performance of these +models is consistently good across NVIDIA and AMD GPUs. We use five proxy +applications from different scientific domains, create implementations where +missing, and use them to present a comprehensive comparative evaluation of the +programming models. We provide a Spack scripting-based methodology to ensure +reproducibility of experiments conducted in this work. Finally, we attempt to +answer the question -- to what extent does each programming model provide +performance portability for heterogeneous systems in real-world usage? + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+
+
+
+ + Programming and Languages 9 + +
+
+
+ + ☆ A Sound Type System for Secure Currency Flow + + +
+ In this paper we focus on TinySol, a minimal calculus for Solidity smart +contracts, introduced by Bartoletti et al. We start by rephrasing its syntax +(to emphasise its object-oriented flavour) and give a new big-step operational +semantics. We then use it to define two security properties, namely call +integrity and noninterference. These two properties have some similarities in +their definition, in that they both require that some part of a program is not +influenced by the other part. However, we show that the two properties are +actually incomparable. Nevertheless, we provide a type system for +noninterference and show that well-typed programs satisfy call integrity as +well; hence, programs that are accepted by our type system satisfy both +properties. We finally discuss the practical usability of the type system and +its limitations by means of some simple examples. + +
+
+
+
+
+ + ☆ Unveiling the Power of Intermediate Representations for Static Analysis: + A Survey + + +
+ Static analysis techniques enhance the security, performance, and reliability +of programs by analyzing and portraiting program behaviors without the need for +actual execution. In essence, static analysis takes the Intermediate +Representation (IR) of a target program as input to retrieve essential program +information and understand the program. However, there is a lack of systematic +analysis on the benefit of IR for static analysis, besides serving as an +information provider. In general, a modern static analysis framework should +possess the ability to conduct diverse analyses on different languages, +producing reliable results with minimal time consumption, and offering +extensive customization options. In this survey, we systematically characterize +these goals and review the potential solutions from the perspective of IR. It +can serve as a manual for learners and practitioners in the static analysis +field to better understand IR design. Meanwhile, numerous research +opportunities are revealed for researchers. + +
+
+
+
+
+ + ☆ GeckoGraph: A Visual Language for Polymorphic Types + + +
+ Polymorphic types are an important feature in most strongly typed programming +languages. They allow functions to be written in a way that can be used with +different data types, while still enforcing the relationship and constraints +between the values. However, programmers often find polymorphic types difficult +to use and understand and tend to reason using concrete types. We propose +GeckoGraph, a graphical notation for types. GeckoGraph aims to accompany +traditional text-based type notation and to make reading, understanding, and +comparing types easier. We conducted a large-scale human study using GeckoGraph +compared to text-based type notation. To our knowledge, this is the largest +controlled user study on functional programming ever conducted. The results of +the study show that GeckoGraph helps improve programmers' ability to succeed in +the programming tasks we designed, especially for novice programmers. + +
+
+
+
+
+ + ☆ Goanna: Resolving Haskell Type Errors With Minimal Correction Subsets + + +
+ Statically typed languages offer significant advantages, such as bug +prevention, enhanced code quality, and reduced maintenance costs. However, +these benefits often come at the expense of a steep learning curve and a slower +development pace. Haskell, known for its expressive and strict type system, +poses challenges for inexperienced programmers in learning and using its type +system, especially in debugging type errors. We introduce Goanna, a novel tool +that serves as a type checker and an interactive type error debugging tool for +Haskell. When encountering type errors, Goanna identifies a comprehensive list +of potential causes and resolutions based on the minimum correction subsets +(MCS) enumeration. We evaluated Goanna's effectiveness using 86 diverse Haskell +programs from online discourse, demonstrating its ability to accurately +identify and resolve type errors. Additionally, we present a collection of +techniques and heuristics to enhance Goanna's suggestion-based error diagnosis +and show their effectiveness from our evaluation. + +
+
+
+
+
+ + ☆ Reduction Strategies in the Lambda Calculus and Their Implementation + through Derivable Abstract Machines: Introduction + + +
+ The lambda calculus since more than half a century is a model and foundation +of functional programming languages. However, lambda expressions can be +evaluated with different reduction strategies and thus, there is no fixed cost +model nor one canonical implementation for all applications of the lambda +calculus. + This article is an introduction to a dissertation is composed of four +conference papers where: we present a systematic survey of reduction strategies +of the lambda calculus; we take advantage of the functional correspondence as a +tool for studying implementations of the lambda calculus by deriving an +abstract machine for a precisely identified strong call-by-value reduction +strategy; we improve it to obtain an efficient abstract machine for strong call +by value and provide a time complexity analysis for the new machine with the +use of a potential function; and we present the first provably efficient +abstract machine for strong call by need. + +
+
+ comment: 37 pages, 12 figures, 2 tables, 4 code listings +
+
+
+
+
+ + ☆ Fully Randomized Pointers + + +
+ Software security continues to be a critical concern for programs implemented +in low-level programming languages such as C and C++. Many defenses have been +proposed in the current literature, each with different trade-offs including +performance, compatibility, and attack resistance. One general class of defense +is pointer randomization or authentication, where invalid object access (e.g., +memory errors) is obfuscated or denied. Many defenses rely on the program +termination (e.g., crashing) to abort attacks, with the implicit assumption +that an adversary cannot "brute force" the defense with multiple attack +attempts. However, such assumptions do not always hold, such as hardware +speculative execution attacks or network servers configured to restart on +error. In such cases, we argue that most existing defenses provide only weak +effective security. + In this paper, we propose Fully Randomized Pointers (FRP) as a stronger +memory error defense that is resistant to even brute force attacks. The key +idea is to fully randomize pointer bits -- as much as possible while also +preserving binary compatibility -- rendering the relationships between pointers +highly unpredictable. Furthermore, the very high degree of randomization +renders brute force attacks impractical -- providing strong effective security +compared to existing work. We design a new FRP encoding that is: (1) compatible +with existing binary code (without recompilation); (2) decoupled from the +underlying object layout; and (3) can be efficiently decoded on-the-fly to the +underlying memory address. We prototype FRP in the form of a software +implementation (BlueFat) to test security and compatibility, and a +proof-of-concept hardware implementation (GreenFat) to evaluate performance. We +show that FRP is secure, practical, and compatible at the binary level, while a +hardware implementation can achieve low performance overheads (<10%). + +
+
+ comment: 24 pages, 3 figures +
+
+
+
+
+ + ☆ Compiler support for semi-manual AoS-to-SoA conversions with data views + + +
+ The C programming language and its cousins such as C++ stipulate the static +storage of sets of structured data: Developers have to commit to one, invariant +data model -- typically a structure-of-arrays (SoA) or an array-of-structs +(AoS) -- unless they manually rearrange, i.e.~convert it throughout the +computation. Whether AoS or SoA is favourable depends on the execution context +and algorithm step. We propose a language extension based upon C++ attributes +through which developers can guide the compiler what memory arrangements are to +be used. The compiler can then automatically convert (parts of) the data into +the format of choice prior to a calculation and convert results back +afterwards. As all conversions are merely annotations, it is straightforward +for the developer to experiment with different storage formats and to pick +subsets of data that are subject to memory rearrangements. Our work implements +the annotations within Clang and demonstrates their potential impact through a +smoothed particle hydrodynamics (SPH) code. + +
+
+
+
+
+ + ♻ ☆ Monoidal closure of Grothendieck constructions via $Σ$-tractable + monoidal structures and Dialectica formulas + + +
+ We study the categorical structure of the Grothendieck construction of an +indexed category $\mathcal{L}:\mathcal{C}^{op}\to\mathbf{CAT}$ and characterise +fibred limits, colimits, and monoidal structures. Next, we give sufficient +conditions for the monoidal closure of the total category $\Sigma_\mathcal{C} +\mathcal{L}$ of a Grothendieck construction of an indexed category +$\mathcal{L}:\mathcal{C}^{op}\to\mathbf{CAT}$. Our analysis is a generalization +of G\"odel's Dialectica interpretation, and it relies on a novel notion of +$\Sigma$-tractable monoidal structure. As we will see, $\Sigma$-tractable +coproducts simultaneously generalize cocartesian coclosed structures, +biproducts and extensive coproducts. We analyse when the closed structure is +fibred -- usually it is not. + +
+
+
+
+
+ + ♻ ☆ AIOS Compiler: LLM as Interpreter for Natural Language Programming and + Flow Programming of AI Agents + + +
+ Since their inception, programming languages have trended towards greater +readability and lower barriers for programmers. Following this trend, natural +language can be a promising type of programming language that provides great +flexibility and usability and helps towards the democracy of programming. +However, the inherent vagueness, ambiguity, and verbosity of natural language +pose significant challenges in developing an interpreter that can accurately +understand the programming logic and execute instructions written in natural +language. Fortunately, recent advancements in Large Language Models (LLMs) have +demonstrated remarkable proficiency in interpreting complex natural language. +Inspired by this, we develop a novel system for Code Representation and +Execution (CoRE), which employs LLM as interpreter to interpret and execute +natural language instructions. The proposed system unifies natural language +programming, pseudo-code programming, and flow programming under the same +representation for constructing language agents, while LLM serves as the +interpreter to interpret and execute the agent programs. In this paper, we +begin with defining the programming syntax that structures natural language +instructions logically. During the execution, we incorporate external memory to +minimize redundancy. Furthermore, we equip the designed interpreter with the +capability to invoke external tools, compensating for the limitations of LLM in +specialized domains or when accessing real-time information. This work is +open-source at https://github.com/agiresearch/CoRE, +https://github.com/agiresearch/OpenAGI, and +https://github.com/agiresearch/AIOS. + +
+
+ comment: 12 pages, 6 figures, comments and suggestions are welcome +
+
+
+
+
+
+
+
+ + Computational Complexity 7 + +
+
+
+ + ☆ Ergodic Unobservable MDPs: Decidability of Approximation + + +
+ Unobservable Markov decision processes (UMDPs) serve as a prominent +mathematical framework for modeling sequential decision-making problems. A key +aspect in computational analysis is the consideration of decidability, which +concerns the existence of algorithms. In general, the computation of the exact +and approximated values is undecidable for UMDPs with the long-run average +objective. Building on matrix product theory and ergodic properties, we +introduce a novel subclass of UMDPs, termed ergodic UMDPs. Our main result +demonstrates that approximating the value within this subclass is decidable. +However, we show that the exact problem remains undecidable. Finally, we +discuss the primary challenges of extending these results to partially +observable Markov decision processes. + +
+
+
+
+
+ + ☆ A Subexponential Reduction from Product Partition to Subset Sum + + +
+ In this paper we study the Product Partition Problem (PPP), i.e. we are given +a set of $n$ natural numbers represented on $m$ bits each and we are asked if a +subset exists such that the product of the numbers in the subset equals the +product of the numbers not in the subset. Our approach is to obtain the integer +factorization of each number. This is the subexponential step. We then form a +matrix with the exponents of the primes and show that the PPP has a solution +iff some Subset Sum Problems have a common solution. Finally, using the fact +that the exponents are not large we combine all the Subset Sum Problems in a +single Subset Sum Problem (SSP) and show that its size is polynomial in $m,n$. +We show that the PPP has a solution iff the final SSP has one. + +
+
+
+
+
+ + ☆ Pseudorandomness, symmetry, smoothing: I + + +
+ We prove several new results about bounded uniform and small-bias +distributions. A main message is that, small-bias, even perturbed with noise, +does not fool several classes of tests better than bounded uniformity. We prove +this for threshold tests, small-space algorithms, and small-depth circuits. In +particular, we obtain small-bias distributions that + 1) achieve an optimal lower bound on their statistical distance to any +bounded-uniform distribution. This closes a line of research initiated by Alon, +Goldreich, and Mansour in 2003, and improves on a result by O'Donnell and Zhao. + 2) have heavier tail mass than the uniform distribution. This answers a +question posed by several researchers including Bun and Steinke. + 3) rule out a popular paradigm for constructing pseudorandom generators, +originating in a 1989 work by Ajtai and Wigderson. This again answers a +question raised by several researchers. For branching programs, our result +matches a bound by Forbes and Kelley. + Our small-bias distributions above are symmetric. We show that the xor of any +two symmetric small-bias distributions fools any bounded function. Hence our +examples cannot be extended to the xor of two small-bias distributions, another +popular paradigm whose power remains unknown. We also generalize and simplify +the proof of a result of Bazzi. + +
+
+ comment: CCC 2024 +
+
+
+
+
+ + ♻ ☆ Inner-approximate Reachability Computation via Zonotopic Boundary + Analysis + + +
+ Inner-approximate reachability analysis involves calculating subsets of +reachable sets, known as inner-approximations. This analysis is crucial in the +fields of dynamic systems analysis and control theory as it provides a reliable +estimation of the set of states that a system can reach from given initial +states at a specific time instant. In this paper, we study the +inner-approximate reachability analysis problem based on the set-boundary +reachability method for systems modelled by ordinary differential equations, in +which the computed inner-approximations are represented with zonotopes. The +set-boundary reachability method computes an inner-approximation by excluding +states reached from the initial set's boundary. The effectiveness of this +method is highly dependent on the efficient extraction of the exact boundary of +the initial set. To address this, we propose methods leveraging boundary and +tiling matrices that can efficiently extract and refine the exact boundary of +the initial set represented by zonotopes. Additionally, we enhance the +exclusion strategy by contracting the outer-approximations in a flexible way, +which allows for the computation of less conservative inner-approximations. To +evaluate the proposed method, we compare it with state-of-the-art methods +against a series of benchmarks. The numerical results demonstrate that our +method is not only efficient but also accurate in computing +inner-approximations. + +
+
+ comment: the extended version of the paper accepted by CAV 2024 +
+
+
+
+
+ + ♻ ☆ Two Choices are Enough for P-LCPs, USOs, and Colorful Tangents + + +
+ We provide polynomial-time reductions between three search problems from +three distinct areas: the P-matrix linear complementarity problem (P-LCP), +finding the sink of a unique sink orientation (USO), and a variant of the +$\alpha$-Ham Sandwich problem. For all three settings, we show that "two +choices are enough", meaning that the general non-binary version of the problem +can be reduced in polynomial time to the binary version. This specifically +means that generalized P-LCPs are equivalent to P-LCPs, and grid USOs are +equivalent to cube USOs. These results are obtained by showing that both the +P-LCP and our $\alpha$-Ham Sandwich variant are equivalent to a new problem we +introduce, P-Lin-Bellman. This problem can be seen as a new tool for +formulating problems as P-LCPs. + +
+
+ comment: 29 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Baby PIH: Parameterized Inapproximability of Min CSP + + +
+ The Parameterized Inapproximability Hypothesis (PIH) is the analog of the PCP +theorem in the world of parameterized complexity. It asserts that no FPT +algorithm can distinguish a satisfiable 2CSP instance from one which is only +$(1-\varepsilon)$-satisfiable (where the parameter is the number of variables) +for some constant $0<\varepsilon<1$. + We consider a minimization version of CSPs (Min-CSP), where one may assign +$r$ values to each variable, and the goal is to ensure that every constraint is +satisfied by some choice among the $r \times r$ pairs of values assigned to its +variables (call such a CSP instance $r$-list-satisfiable). We prove the +following strong parameterized inapproximability for Min CSP: For every $r \ge +1$, it is W[1]-hard to tell if a 2CSP instance is satisfiable or is not even +$r$-list-satisfiable. We refer to this statement as "Baby PIH", following the +recently proved Baby PCP Theorem (Barto and Kozik, 2021). Our proof adapts the +combinatorial arguments underlying the Baby PCP theorem, overcoming some basic +obstacles that arise in the parameterized setting. Furthermore, our reduction +runs in time polynomially bounded in both the number of variables and the +alphabet size, and thus implies the Baby PCP theorem as well. + +
+
+
+
+
+ + ♻ ☆ A criterion for Andrásfai--Erdős--Sós type theorems and + applications + + +
+ The classical Andr\'{a}sfai--Erd\H{o}s--S\'{o}s Theorem states that for +$\ell\ge 2$, every $n$-vertex $K_{\ell+1}$-free graph with minimum degree +greater than $\frac{3\ell-4}{3\ell-1}n$ must be $\ell$-partite. We establish a +simple criterion for $r$-graphs, $r \geq 2$, to exhibit an +Andr\'{a}sfai--Erd\H{o}s--S\'{o}s type property, also known as +degree-stability. This leads to a classification of most previously studied +hypergraph families with this property. An immediate application of this +result, combined with a general theorem by Keevash--Lenz--Mubayi, solves the +spectral Tur\'{a}n problems for a large class of hypergraphs. + For every $r$-graph $F$ with degree-stability, there is a simple algorithm to +decide the $F$-freeness of an $n$-vertex $r$-graph with minimum degree greater +than $(\pi(F) - \varepsilon_F)\binom{n}{r-1}$ in time $O(n^r)$, where +$\varepsilon_F >0$ is a constant. In particular, for the complete graph +$K_{\ell+1}$, we can take $\varepsilon_{K_{\ell+1}} = (3\ell^2-\ell)^{-1}$, and +this bound is tight up to some multiplicative constant factor unless +$\mathbf{W[1]} = \mathbf{FPT}$. Based on a result by Chen--Huang--Kanj--Xia, we +further show that for every fixed $C > 0$, this problem cannot be solved in +time $n^{o(\ell)}$ if we replace $\varepsilon_{K_{\ell+1}}$ with $(C\ell)^{-1}$ +unless $\mathbf{ETH}$ fails. Furthermore, we apply the degree-stability of +$K_{\ell+1}$ to decide the $K_{\ell+1}$-freeness of graphs whose size is close +to the Tur\'{a}n bound in time $(\ell+1)n^2$, partially improving a recent +result by Fomin--Golovach--Sagunov--Simonov. As an intermediate step, we show +that for a specific class of $r$-graphs $F$, the (surjective) $F$-coloring +problem can be solved in time $O(n^r)$, provided the input $r$-graph has $n$ +vertices and a large minimum degree, refining several previous results. + +
+
+ comment: fixed some typos, changed the title, reorganized to enhance + readability for combinatorial readers, comments are welcome +
+
+
+
+
+
+
+
+ + Logic in Computer Science 13 + +
+
+
+ + ☆ Commutative codensity monads and probability bimeasures + + +
+ Several well-studied probability monads have been expressed as codensity +monads over small categories of stochastic maps, giving a limit description of +spaces of probability measures. In this paper we show how properties of +probability monads such as commutativity and affineness can arise from their +codensity presentation. First we show that their codensity presentation is +closely related to another characterisation of probability monads as terminal +endofunctors admitting certain maps into the Giry monad, which allows us to +generalise a result by Van Breugel on the Kantorovich monad. We then provide +sufficient conditions for a codensity monad to lift to $\bf{MonCat}$, and give +a characterisation of exactly pointwise monoidal codensity monads; codensity +monads that satisfy a strengthening of these conditions. We show that the Radon +monad is exactly pointwise monoidal, and hence give a description of the tensor +product of free algebras of the Radon monad in terms of Day convolution. +Finally we show that the Giry monad is not exactly pointwise monoidal due to +the existence of probability bimeasures that do not extend to measures, +although its restriction to standard Borel spaces is. We introduce the notion +of a $*$-monad and its Kleisli monoidal op-multicategory to describe the +categorical structure that organises the spaces of probability polymeasures on +measurable spaces. + +
+
+
+
+
+ + ☆ Centralized vs Decentralized Monitors for Hyperproperties + + +
+ This paper focuses on the runtime verification of hyperproperties expressed +in HypermuHML, an expressive yet simple logic for describing properties of sets +of traces. To this end, we first consider a simple language of monitors that +can observe sets of system executions and report verdicts w.r.t. a given +HypermuHML formula. In this setting, a unique omniscient monitor observes all +system traces, and, in this sense, it is 'centralized'. However, in a possibly +distributed system, having a centralized entity is undesirable; hence, we also +provide a language for 'decentralized' monitors, where each trace has its own +monitor, and monitors for different traces can yield a unique verdict by +communicating their observations. For both the centralized and the +decentralized settings, we provide a synthesis procedure that, given a formula, +yields a monitor that is correct (i.e., sound and violation complete). A key +step in proving the correctness of the synthesis for decentralized monitors is +a result showing that, for each formula, the synthesized centralized monitor +and its corresponding decentralized one are weakly bisimilar for a suitable +notion of weak bisimulation. + +
+
+
+
+
+ + ☆ Getting Wiser from Multiple Data: Probabilistic Updating according to + Jeffrey and Pearl + + +
+ In probabilistic updating one transforms a prior distribution in the light of +given evidence into a posterior distribution, via what is called conditioning, +updating, belief revision or inference. This is the essence of learning, as +Bayesian updating. It will be illustrated via a physical model involving +(adapted) water flows through pipes with different diameters. + Bayesian updating makes us wiser, in the sense that the posterior +distribution makes the evidence more likely than the prior, since it +incorporates the evidence. Things are less clear when one wishes to learn from +multiple pieces of evidence / data. It turns out that there are (at least) two +forms of updating for this, associated with Jeffrey and Pearl. The difference +is not always clearly recognised. + This paper provides an introduction and an overview in the setting of +discrete probability theory. It starts from an elementary question, involving +multiple pieces of evidence, that has been sent to a small group academic +specialists. Their answers show considerable differences. This is used as +motivation and starting point to introduce the two forms of updating, of +Jeffrey and Pearl, for multiple inputs and to elaborate their properties. In +the end the account is related to so-called variational free energy (VFE) +update in the cognitive theory of predictive processing. It is shown that both +Jeffrey and Pearl outperform VFE updating and that VFE updating need not +decrease divergence - that is correct errors - as it is supposed to do. + +
+
+
+
+
+ + ☆ Utilizing Description Logics for Global Explanations of Heterogeneous + Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) are effective for node classification in +graph-structured data, but they lack explainability, especially at the global +level. Current research mainly utilizes subgraphs of the input as local +explanations or generates new graphs as global explanations. However, these +graph-based methods are limited in their ability to explain classes with +multiple sufficient explanations. To provide more expressive explanations, we +propose utilizing class expressions (CEs) from the field of description logic +(DL). Our approach explains heterogeneous graphs with different types of nodes +using CEs in the EL description logic. To identify the best explanation among +multiple candidate explanations, we employ and compare two different scoring +functions: (1) For a given CE, we construct multiple graphs, have the GNN make +a prediction for each graph, and aggregate the predicted scores. (2) We score +the CE in terms of fidelity, i.e., we compare the predictions of the GNN to the +predictions by the CE on a separate validation set. Instead of subgraph-based +explanations, we offer CE-based explanations. + +
+
+
+
+
+ + ☆ RA: A machine based rational agent, Part 1 + + +
+ RA is a software package that couples machine learning with formal reasoning +in an attempt to find the laws that generate the empirical data that it has +been given access to. A brief outline of RA in its initial stage of development +is presented. Particular emphasis is given to current design strategies that +aim to endow RA with the ability to construct its own conjectures of which it +constructs proofs. + +
+
+
+
+
+ + ☆ Inferring Message Flows From System Communication Traces + + +
+ This paper proposes a novel method for automatically inferring message flow +specifications from the communication traces of a system-on-chip (SoC) design +that captures messages exchanged among the components during a system +execution. The inferred message flows characterize the communication and +coordination of components in a system design for realizing various system +functions, and they are essential for SoC validation and debugging. The +proposed method relieves the burden of manual development and maintenance of +such specifications on human designers. Our method also uses a new accuracy +metric, \emph{acceptance ratio}, to evaluate the quality of the mined +specifications instead of the specification size often used in the previous +work, enabling more accurate specifications to be mined. Furthermore, this +paper introduces the concept of essential causalities to enhance the accuracy +of the message flow mining and accelerate the mining process. The effectiveness +of the proposed method is evaluated on both synthetic traces and traces +generated from executing several system models in GEM5. In both cases, the +proposed method achieves superior accuracies compared to a previous approach. +Additionally, this paper includes some practical use cases. + +
+
+
+
+
+ + ♻ ☆ Initial Algebras Unchained -- A Novel Initial Algebra Construction + Formalized in Agda + + +
+ The initial algebra for an endofunctor F provides a recursion and induction +scheme for data structures whose constructors are described by F. The +initial-algebra construction by Ad\'amek (1974) starts with the initial object +(e.g. the empty set) and successively applies the functor until a fixed point +is reached, an idea inspired by Kleene's fixed point theorem. Depending on the +functor of interest, this may require transfinitely many steps indexed by +ordinal numbers until termination. + We provide a new initial algebra construction which is not based on an +ordinal-indexed chain. Instead, our construction is loosely inspired by +Pataraia's fixed point theorem and forms the colimit of all finite recursive +coalgebras. This is reminiscent of the construction of the rational fixed point +of an endofunctor that forms the colimit of all finite coalgebras. For our main +correctness theorem, we assume the given endofunctor is accessible on a (weak +form of) locally presentable category. Our proofs are constructive and fully +formalized in Agda. + +
+
+
+
+
+ + ♻ ☆ Monoidal closure of Grothendieck constructions via $Σ$-tractable + monoidal structures and Dialectica formulas + + +
+ We study the categorical structure of the Grothendieck construction of an +indexed category $\mathcal{L}:\mathcal{C}^{op}\to\mathbf{CAT}$ and characterise +fibred limits, colimits, and monoidal structures. Next, we give sufficient +conditions for the monoidal closure of the total category $\Sigma_\mathcal{C} +\mathcal{L}$ of a Grothendieck construction of an indexed category +$\mathcal{L}:\mathcal{C}^{op}\to\mathbf{CAT}$. Our analysis is a generalization +of G\"odel's Dialectica interpretation, and it relies on a novel notion of +$\Sigma$-tractable monoidal structure. As we will see, $\Sigma$-tractable +coproducts simultaneously generalize cocartesian coclosed structures, +biproducts and extensive coproducts. We analyse when the closed structure is +fibred -- usually it is not. + +
+
+
+
+
+ + ♻ ☆ A Uniform Language to Explain Decision Trees + + +
+ The formal XAI community has studied a plethora of interpretability queries +aiming to understand the classifications made by decision trees. However, a +more uniform understanding of what questions we can hope to answer about these +models, traditionally deemed to be easily interpretable, has remained elusive. +In an initial attempt to understand uniform languages for interpretability, +Arenas et al. (2021) proposed FOIL, a logic for explaining black-box ML models, +and showed that it can express a variety of interpretability queries. However, +we show that FOIL is limited in two important senses: (i) it is not expressive +enough to capture some crucial queries, and (ii) its model agnostic nature +results in a high computational complexity for decision trees. In this paper, +we carefully craft two fragments of first-order logic that allow for +efficiently interpreting decision trees: Q-DT-FOIL and its optimization variant +OPT-DT-FOIL. We show that our proposed logics can express not only a variety of +interpretability queries considered by previous literature, but also elegantly +allows users to specify different objectives the sought explanations should +optimize for. Using finite model-theoretic techniques, we show that the +different ingredients of Q-DT-FOIL are necessary for its expressiveness, and +yet that queries in Q-DT-FOIL can be evaluated with a polynomial number of +queries to a SAT solver, as well as their optimization versions in OPT-DT-FOIL. +Besides our theoretical results, we provide a SAT-based implementation of the +evaluation for OPT-DT-FOIL that is performant on industry-size decision trees. + +
+
+
+
+
+ + ♻ ☆ Executable First-Order Queries in the Logic of Information Flows + + +
+ The logic of information flows (LIF) has recently been proposed as a general +framework in the field of knowledge representation. In this framework, tasks of +procedural nature can still be modeled in a declarative, logic-based fashion. +In this paper, we focus on the task of query processing under limited access +patterns, a well-studied problem in the database literature. We show that LIF +is well-suited for modeling this task. Toward this goal, we introduce a variant +of LIF called "forward" LIF (FLIF), in a first-order setting. FLIF takes a +novel graph-navigational approach; it is an XPath-like language that +nevertheless turns out to be equivalent to the "executable" fragment of +first-order logic defined by Nash and Lud\"ascher. One can also classify the +variables in FLIF expressions as inputs and outputs. Expressions where inputs +and outputs are disjoint, referred to as io-disjoint FLIF expressions, allow a +particularly transparent translation into algebraic query plans that respect +the access limitations. Finally, we show that general FLIF expressions can +always be put into io-disjoint form. + +
+
+
+
+
+ + ♻ ☆ Learning Explainable and Better Performing Representations of POMDP + Strategies + + +
+ Strategies for partially observable Markov decision processes (POMDP) +typically require memory. One way to represent this memory is via automata. We +present a method to learn an automaton representation of a strategy using a +modification of the L*-algorithm. Compared to the tabular representation of a +strategy, the resulting automaton is dramatically smaller and thus also more +explainable. Moreover, in the learning process, our heuristics may even improve +the strategy's performance. In contrast to approaches that synthesize an +automaton directly from the POMDP thereby solving it, our approach is +incomparably more scalable. + +
+
+ comment: Technical report for the submission to TACAS 24 +
+
+
+
+
+ + ♻ ☆ Note on a Translation from First-Order Logic into the Calculus of + Relations Preserving Validity and Finite Validity + + +
+ In this note, we give a linear-size translation from formulas of first-order +logic into equations of the calculus of relations preserving validity and +finite validity. Our translation also gives a linear-size conservative +reduction from formulas of first-order logic into formulas of the +three-variable fragment of first-order logic. + +
+
+
+
+
+ + ♻ ☆ Coinductive Streams in Monoidal Categories + + +
+ We introduce monoidal streams. Monoidal streams are a generalization of +causal stream functions, which can be defined in cartesian monoidal categories, +to arbitrary symmetric monoidal categories. In the same way that streams +provide semantics to dataflow programming with pure functions, monoidal streams +provide semantics to dataflow programming with theories of processes +represented by a symmetric monoidal category. Monoidal streams also form a +feedback monoidal category. In the same way that we can use a coinductive +stream calculus to reason about signal flow graphs, we can use coinductive +string diagrams to reason about feedback monoidal categories. As an example, we +study syntax for a stochastic dataflow language, with semantics in stochastic +monoidal streams. + +
+
+ comment: Expanded version of Monoidal Streams for Dataflow Programming, + arXiv:2202.02061. We thank the reviewers at LMCS for multiple suggestions + that have improved this version. 57 pages, 33 figures +
+
+
+
+
+
+
+
+ 
 Hardware Architecture 6 
 
+
+
+
+ + ☆ Training and inference in the ReckON RSNN architecture implemented on a + MPSoC + + +
+ With the rise of artificial intelligence, biological neuron models are being +used to implement neural networks that can learn certain tasks after a training +phase. One type of such networks are spiking neural networks (SNNs) that rely +on a simplified model for biological neurons, the Integrate and Fire neuron. +Several accelerators have emerged to implement SNNs with this kind of neuron. +The ReckON system is one of these that allows both the training and execution +of a recurrent SNN. The ReckON architecture, implemented on a custom ASIC, can +be fully described using a hardware description language. In this work, we +adapt the Verilog description to implement it on a Xilinx Multiprocessor System +on Chip system (MPSoC). We present the circuits required for the efficient +operation of the system, and a Python framework to use it on the Pynq ZU +platform. We validate the architecture and implementation in two different +scenarios, and show how the simulated accuracy is preserved with a peak +performance of 3.8M events processed per second. + +
+
+ comment: Under review at ICECS'24 +
+
+
+
+
+ + ☆ Automating Attendance Management in Human Resources: A Design Science + Approach Using Computer Vision and Facial Recognition + + +
+ Haar Cascade is a cost-effective and user-friendly machine learning-based +algorithm for detecting objects in images and videos. Unlike Deep Learning +algorithms, which typically require significant resources and expensive +computing costs, it uses simple image processing techniques like edge detection +and Haar features that are easy to comprehend and implement. By combining Haar +Cascade with OpenCV2 on an embedded computer like the NVIDIA Jetson Nano, this +system can accurately detect and match faces in a database for attendance +tracking. This system aims to achieve several specific objectives that set it +apart from existing solutions. It leverages Haar Cascade, enriched with +carefully selected Haar features, such as Haar-like wavelets, and employs +advanced edge detection techniques. These techniques enable precise face +detection and matching in both images and videos, contributing to high accuracy +and robust performance. By doing so, it minimizes manual intervention and +reduces errors, thereby strengthening accountability. Additionally, the +integration of OpenCV2 and the NVIDIA Jetson Nano optimizes processing +efficiency, making it suitable for resource-constrained environments. This +system caters to a diverse range of educational institutions, including +schools, colleges, vocational training centers, and various workplace settings +such as small businesses, offices, and factories. ... The system's +affordability and efficiency democratize attendance management technology, +making it accessible to a broader audience. Consequently, it has the potential +to transform attendance tracking and management practices, ultimately leading +to heightened productivity and accountability. In conclusion, this system +represents a groundbreaking approach to attendance tracking and management... + +
+
+ comment: 31 pages, accepted to publish by the International Journal of + Information Management Data Insights (IJIMDS) in 2024 +
+
+
+
+
+ + ☆ FEATHER: A Reconfigurable Accelerator with Data Reordering Support for + Low-Cost On-Chip Dataflow Switching ISCA + + +
+ The inference of ML models composed of diverse structures, types, and sizes +boils down to the execution of different dataflows (i.e. different tiling, +ordering, parallelism, and shapes). Using the optimal dataflow for every layer +of workload can reduce latency by up to two orders of magnitude over a +suboptimal dataflow. Unfortunately, reconfiguring hardware for different +dataflows involves on-chip data layout reordering and datapath +reconfigurations, leading to non-trivial overhead that hinders ML accelerators +from exploiting different dataflows, resulting in suboptimal performance. To +address this challenge, we propose FEATHER, an innovative accelerator that +leverages a novel spatial array termed Nest and a novel multi-stage reduction +network called BIRRD for performing flexible data reduction with layout +reordering under the hood, enabling seamless switching between optimal +dataflows with negligible latency and resources overhead. For systematically +evaluating the performance interaction between dataflows and layouts, we +enhance Timeloop, a state-of-the-art dataflow cost modeling and search +framework, with layout assessment capabilities, and term it as Layoutloop. We +model FEATHER into Layoutloop and also deploy FEATHER end-to-end on the edge +ZCU104 FPGA. FEATHER delivers 1.27~2.89x inference latency speedup and +1.3~6.43x energy efficiency improvement compared to various SoTAs like NVDLA, +SIGMA and Eyeriss under ResNet-50 and MobiletNet-V3 in Layoutloop. On practical +FPGA devices, FEATHER achieves 2.65/3.91x higher throughput than Xilinx +DPU/Gemmini. Remarkably, such performance and energy efficiency enhancements +come at only 6% area over a fixed-dataflow Eyeriss-like accelerator. Our code +is released at https://github.com/maeri-project/FEATHER. + +
+
+ comment: 17 pages, 14 figures. International Symposium on Computer + Architecture (ISCA), Jun 2024 +
+
+
+
+
+ + ♻ ☆ Merits of Time-Domain Computing for VMM -- A Quantitative Comparison + + +
+ Vector-matrix-multiplication (VMM) accelerators have gained a lot of
+traction, especially due to the rise of convolutional neural networks (CNNs)
+and the desire to compute them on the edge. Besides the classical digital
+approach, analog computing has gone through a renaissance to push energy
+efficiency further. A more recent approach is called time-domain (TD)
+computing. In contrast to analog computing, TD computing permits easy
+technology as well as voltage scaling. As it has received limited research
+attention, it is not yet clear which scenarios are most suitable to be
+computed in the TD. In this work, we investigate these scenarios, focussing
+on energy efficiency considering approximative computations that preserve
+accuracy. Both goals are addressed by a novel efficiency metric, which is
+used to find a baseline design. We use SPICE simulation data which is fed
+into a python framework to evaluate how performance scales for VMM
+computation. We see that TD computing offers best energy efficiency for
+small to medium sized arrays. With throughput and silicon footprint we
+investigate two additional metrics, giving a holistic comparison.
+ 
+
+ comment: 8 pages, 12 figures. This paper was accepted at the 25th + International Symposium on Quality Electronic Design(ISQED) 2024. DOI: + 10.1109/ISQED60706.2024.10528682 +
+
+
+
+
+ + ♻ ☆ Exploring the Design Space for Message-Driven Systems for Dynamic Graph + Processing using CCA + + +
+ Computer systems that have been successfully deployed for dense regular +workloads fall short of achieving scalability and efficiency when applied to +irregular and dynamic graph applications. Conventional computing systems rely +heavily on static, regular, numeric intensive computations while High +Performance Computing systems executing parallel graph applications exhibit +little locality, spatial or temporal, and are fine-grained and memory +intensive. With the strong interest in AI which depend on these very different +use cases combined with the end of Moore's Law at nanoscale, dramatic +alternatives in architecture and underlying execution models are required. This +paper identifies an innovative non-von Neumann architecture, Continuum Computer +Architecture (CCA), that redefines the nature of computing structures to yield +powerful innovations in computational methods to deliver a new generation of +highly parallel hardware architecture. CCA reflects a genus of highly parallel +architectures that while varying in specific quantities (e.g., memory blocks), +share a multiple of attributes not found in typical von Neumann machines. Among +these are memory-centric components, message-driven asynchronous flow control, +and lightweight out-of-order execution across a global name space. Together +these innovative non-von Neumann architectural properties guided by a new +original execution model will deliver the new future path for extending beyond +the von Neumann model. This paper documents a series of interrelated +experiments that together establish future directions for next generation +non-von Neumann architectures, especially for graph processing. + +
+
+
+
+
+ + ♻ ☆ Energy-efficiency Limits on Training AI Systems using Learning-in-Memory + + +
+ Learning-in-memory (LIM) is a recently proposed paradigm to overcome +fundamental memory bottlenecks in training machine learning systems. While +compute-in-memory (CIM) approaches can address the so-called memory-wall (i.e. +energy dissipated due to repeated memory read access) they are agnostic to the +energy dissipated due to repeated memory writes at the precision required for +training (the update-wall), and they don't account for the energy dissipated +when transferring information between short-term and long-term memories (the +consolidation-wall). The LIM paradigm proposes that these bottlenecks, too, can +be overcome if the energy barrier of physical memories is adaptively modulated +such that the dynamics of memory updates and consolidation match the Lyapunov +dynamics of gradient-descent training of an AI model. In this paper, we derive +new theoretical lower bounds on energy dissipation when training AI systems +using different LIM approaches. The analysis presented here is model-agnostic +and highlights the trade-off between energy efficiency and the speed of +training. The resulting non-equilibrium energy-efficiency bounds have a similar +flavor as that of Landauer's energy-dissipation bounds. We also extend these +limits by taking into account the number of floating-point operations (FLOPs) +used for training, the size of the AI model, and the precision of the training +parameters. Our projections suggest that the energy-dissipation lower-bound to +train a brain scale AI system (comprising of $10^{15}$ parameters) using LIM is +$10^8 \sim 10^9$ Joules, which is on the same magnitude the Landauer's +adiabatic lower-bound and $6$ to $7$ orders of magnitude lower than the +projections obtained using state-of-the-art AI accelerator hardware +lower-bounds. + +
+
+ comment: 23 pages, 7 figures +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ☆ Is decidability of the Submonoid Membership Problem closed under finite + extensions? + + +
+ We show that the rational subset membership problem in $G$ can be reduced to +the submonoid membership problem in $G{\times}H$ where $H$ is virtually +Abelian. We use this to show that there is no algorithm reducing submonoid +membership to a finite index subgroup uniformly for all virtually nilpotent +groups. We also provide evidence towards the existence of a group $G$ with a +subgroup $H +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Formal Languages and Automata Theory 4 + +
+
+
+ + ☆ Jumping Automata Must Pay + + +
+ Jumping automata are finite automata that read their input in a +non-sequential manner, by allowing a reading head to ``jump'' between positions +on the input, consuming a permutation of the input word. We argue that allowing +the head to jump should incur some cost. To this end, we propose three +quantitative semantics for jumping automata, whereby the jumps of the head in +an accepting run define the cost of the run. The three semantics correspond to +different interpretations of jumps: the \emph{absolute distance} semantics +counts the distance the head jumps, the \emph{reversal} semantics counts the +number of times the head changes direction, and the \emph{Hamming distance} +measures the number of letter-swaps the run makes. + We study these measures, with the main focus being the \emph{boundedness +problem}: given a jumping automaton, decide whether its (quantitative) +languages is bounded by some given number $k$. We establish the decidability +and complexity for this problem under several variants. + +
+
+
+
+
+ + ☆ A framework for extraction and transformation of documents + + +
+ We present a theoretical framework for the extraction and transformation of +text documents. We propose to use a two-phase process where the first phase +extracts span-tuples from a document, and the second phase maps the content of +the span-tuples into new documents. We base the extraction phase on the +framework of document spanners and the transformation phase on the theory of +polyregular functions, the class of regular string-to-string functions with +polynomial growth. + For supporting practical extract-transform scenarios, we propose an extension +of document spanners described by regex formulas from span-tuples to so-called +multispan-tuples, where variables are mapped to sets of spans. We prove that +this extension, called regex multispanners, has the same desirable properties +as standard spanners described by regex formulas. In our framework, an +Extract-Transform (ET) program is given by a regex multispanner followed by a +polyregular function. + In this paper, we study the expressibility and evaluation problem of ET +programs when the transformation function is linear, called linear ET programs. +We show that linear ET programs are equally expressive as non-deterministic +streaming string transducers under bag semantics. Moreover, we show that linear +ET programs are closed under composition. Finally, we present an enumeration +algorithm for evaluating every linear ET program over a document with linear +time preprocessing and constant delay. + +
+
+
+
+
+ + ♻ ☆ Dynamic Programming for Symbolic Boolean Realizability and Synthesis + + +
+ Inspired by recent progress in dynamic programming approaches for weighted
+model counting, we investigate a dynamic-programming approach in the context of
+boolean realizability and synthesis, which takes a conjunctive-normal-form
+boolean formula over input and output variables, and aims at synthesizing
+witness functions for the output variables in terms of the inputs. We show how
+graded project-join trees, obtained via tree decomposition, can be used to
+compute a BDD representing the realizability set for the input formulas in a
+bottom-up order. We then show how the intermediate BDDs generated during the
+realizability checking phase can be applied to synthesizing the witness
+functions in a top-down manner. An experimental evaluation of a solver --
+DPSynth -- based on these ideas demonstrates that our approach for Boolean
+realizability and synthesis has superior time and space performance over a
+heuristics-based approach using the same symbolic representations. We discuss
+the advantage on scalability of the new approach, and also investigate our
+findings on the performance of the DP framework.
+
+
+ comment: 32 pages including appendices and bibliography, 5 figures, paper is + to be published in CAV 2024, but this version is inclusive of the Appendix +
+
+
+
+
+ + ♻ ☆ Learning Deterministic Multi-Clock Timed Automata SC + + +
+ We present an algorithm for active learning of deterministic timed automata +with multiple clocks. The algorithm is within the querying framework of +Angluin's $L^*$ algorithm and follows the idea proposed in existing work on the +active learning of deterministic one-clock timed automata. We introduce an +equivalence relation over the reset-clocked language of a timed automaton and +then transform the learning problem into learning the corresponding +reset-clocked language of the target automaton. Since a reset-clocked language +includes the clock reset information which is not observable, we first present +the approach of learning from a powerful teacher who can provide reset +information by answering reset information queries from the learner. Then we +extend the algorithm in a normal teacher situation in which the learner can +only ask standard membership query and equivalence query while the learner +guesses the reset information. We prove that the learning algorithm terminates +and returns a correct deterministic timed automaton. Due to the need of +guessing whether the clocks reset at the transitions, the algorithm is of +exponential complexity in the size of the target automaton. + +
+
+ comment: 20 pages. It is an author version of the paper with the same title + accepted by HSCC 2024 +
+
+
+
+
+
+
+
+ + Logic in Computer Science 13 + +
+
+
+ + ☆ Deciding branching hyperproperties for real time systems + + +
+ Security properties of real-time systems often involve reasoning about +hyper-properties, as opposed to properties of single executions or trees of +executions. These hyper-properties need to additionally be expressive enough to +reason about real-time constraints. Examples of such properties include +information flow, side channel attacks and service-level agreements. In this +paper we study computational problems related to a branching-time, +hyper-property extension of metric temporal logic (MTL) that we call HCMTL*. We +consider both the interval-based and point-based semantics of this logic. The +verification problem that we consider is to determine if a given HCMTL* formula +$\varphi$ is true in a system represented by a timed automaton. We show that +this problem is undecidable. We then show that the verification problem is +decidable if we consider executions upto a fixed time horizon $T$. Our +decidability result relies on reducing the verification problem to the truth of +an MSO formula over reals with a bounded time interval. + +
+
+
+
+
+ + ☆ Increasing the LLM Accuracy for Question Answering: Ontologies to the + Rescue! + + +
+ There is increasing evidence that question-answering (QA) systems with Large
+Language Models (LLMs), which employ a knowledge graph/semantic representation
+of an enterprise SQL database (i.e. Text-to-SPARQL), achieve higher accuracy
+compared to systems that answer questions directly on SQL databases (i.e.
+Text-to-SQL). Our previous benchmark research showed that by using a knowledge
+graph, the accuracy improved from 16% to 54%. The question remains: how can we
+further improve the accuracy and reduce the error rate? Building on the
+observations of our previous research where the inaccurate LLM-generated SPARQL
+queries followed incorrect paths, we present an approach that consists of 1)
+Ontology-based Query Check (OBQC): detects errors by leveraging the ontology of
+the knowledge graph to check if the LLM-generated SPARQL query matches the
+semantics of the ontology and 2) LLM Repair: use the error explanations with an
+LLM to repair the SPARQL query. Using the chat with the data benchmark, our
+primary finding is that our approach increases the overall accuracy to 72%
+including an additional 8% of "I don't know" unknown results. Thus, the overall
+error rate is 20%. These results provide further evidence that investing in
+knowledge graphs, namely the ontology, provides higher accuracy for LLM powered
+question answering systems.
+
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Using Color Refinement to Boost Enumeration and Counting for Acyclic CQs + of Binary Schemas + + +
+ We present an index structure, called the color-index, to boost the +evaluation of acyclic conjunctive queries (ACQs) over binary schemas. The +color-index is based on the color refinement algorithm, a widely used +subroutine for graph isomorphism testing algorithms. Given a database $D$, we +use a suitable version of the color refinement algorithm to produce a stable +coloring of $D$, an assignment from the active domain of $D$ to a set of colors +$C_D$. The main ingredient of the color-index is a particular database $D_c$ +whose active domain is $C_D$ and whose size is at most $|D|$. Using the +color-index, we can evaluate any free-connex ACQ $Q$ over $D$ with +preprocessing time $O(|Q| \cdot |D_c|)$ and constant delay enumeration. +Furthermore, we can also count the number of results of $Q$ over $D$ in time +$O(|Q| \cdot |D_c|)$. Given that $|D_c|$ could be much smaller than $|D|$ (even +constant-size for some families of databases), the color-index is the first +index structure for evaluating free-connex ACQs that allows efficient +enumeration and counting with performance that may be strictly smaller than the +database size. + +
+
+
+
+
+ + ♻ ☆ Preservation theorems on sparse classes revisited + + +
+ We revisit the work studying homomorphism preservation for first-order logic +in sparse classes of structures initiated in [Atserias et al., JACM 2006] and +[Dawar, JCSS 2010]. These established that first-order logic has the +homomorphism preservation property in any sparse class that is monotone and +addable. It turns out that the assumption of addability is not strong enough +for the proofs given. We demonstrate this by constructing classes of graphs of +bounded treewidth which are monotone and addable but fail to have homomorphism +preservation. We also show that homomorphism preservation fails on the class of +planar graphs. On the other hand, the proofs of homomorphism preservation can +be recovered by replacing addability by a stronger condition of amalgamation +over bottlenecks. This is analogous to a similar condition formulated for +extension preservation in [Atserias et al., SiCOMP 2008]. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Efficient Implementation of an Abstract Domain of Quantified First-Order + Formulas + + +
+ This paper lays a practical foundation for using abstract interpretation with +an abstract domain that consists of sets of quantified first-order logic +formulas. This abstract domain seems infeasible at first sight due to the +complexity of the formulas involved and the enormous size of sets of formulas +(abstract elements). We introduce an efficient representation of abstract +elements, which eliminates redundancies based on a novel syntactic subsumption +relation that under-approximates semantic entailment. We develop algorithms and +data structures to efficiently compute the join of an abstract element with the +abstraction of a concrete state, operating on the representation of abstract +elements. To demonstrate feasibility of the domain, we use our data structures +and algorithms to implement a symbolic abstraction algorithm that computes the +least fixpoint of the best abstract transformer of a transition system, which +corresponds to the strongest inductive invariant. We succeed at finding, for +example, the least fixpoint for Paxos (which in our representation has 1,438 +formulas with $\forall^*\exists^*\forall^*$ quantification) in time comparable +to state-of-the-art property-directed approaches. + +
+
+
+
+
+ + ♻ ☆ Initial Algebras Unchained -- A Novel Initial Algebra Construction + Formalized in Agda + + +
+ The initial algebra for an endofunctor F provides a recursion and induction +scheme for data structures whose constructors are described by F. The +initial-algebra construction by Ad\'amek (1974) starts with the initial object +(e.g. the empty set) and successively applies the functor until a fixed point +is reached, an idea inspired by Kleene's fixed point theorem. Depending on the +functor of interest, this may require transfinitely many steps indexed by +ordinal numbers until termination. + We provide a new initial algebra construction which is not based on an +ordinal-indexed chain. Instead, our construction is loosely inspired by +Pataraia's fixed point theorem and forms the colimit of all finite recursive +coalgebras. This is reminiscent of the construction of the rational fixed point +of an endofunctor that forms the colimit of all finite coalgebras. For our main +correctness theorem, we assume the given endofunctor is accessible on a (weak +form of) locally presentable category. Our proofs are constructive and fully +formalized in Agda. + +
+
+
+
+
+ + ♻ ☆ Dynamic Programming for Symbolic Boolean Realizability and Synthesis + + +
+ Inspired by recent progress in dynamic programming approaches for weighted
+model counting, we investigate a dynamic-programming approach in the context of
+boolean realizability and synthesis, which takes a conjunctive-normal-form
+boolean formula over input and output variables, and aims at synthesizing
+witness functions for the output variables in terms of the inputs. We show how
+graded project-join trees, obtained via tree decomposition, can be used to
+compute a BDD representing the realizability set for the input formulas in a
+bottom-up order. We then show how the intermediate BDDs generated during the
+realizability checking phase can be applied to synthesizing the witness
+functions in a top-down manner. An experimental evaluation of a solver --
+DPSynth -- based on these ideas demonstrates that our approach for Boolean
+realizability and synthesis has superior time and space performance over a
+heuristics-based approach using the same symbolic representations. We discuss
+the advantage on scalability of the new approach, and also investigate our
+findings on the performance of the DP framework.
+
+
+ comment: 32 pages including appendices and bibliography, 5 figures, paper is + to be published in CAV 2024, but this version is inclusive of the Appendix +
+
+
+
+
+ + ♻ ☆ Solving promise equations over monoids and groups + + +
+ We give a complete complexity classification for the problem of finding a +solution to a given system of equations over a fixed finite monoid, given that +a solution over a more restricted monoid exists. As a corollary, we obtain a +complexity classification for the same problem over groups. + +
+
+ comment: Full version of an ICALP 2024 paper +
+
+
+
+
+ + ♻ ☆ Scoped Effects as Parameterized Algebraic Theories + + +
+ Notions of computation can be modelled by monads. Algebraic effects offer a +characterization of monads in terms of algebraic operations and equational +axioms, where operations are basic programming features, such as reading or +updating the state, and axioms specify observably equivalent expressions. +However, many useful programming features depend on additional mechanisms such +as delimited scopes or dynamically allocated resources. Such mechanisms can be +supported via extensions to algebraic effects including scoped effects and +parameterized algebraic theories. We present a fresh perspective on scoped +effects by translation into a variation of parameterized algebraic theories. +The translation enables a new approach to equational reasoning for scoped +effects and gives rise to an alternative characterization of monads in terms of +generators and equations involving both scoped and algebraic operations. We +demonstrate the power of our fresh perspective by way of equational +characterizations of several known models of scoped effects. + +
+
+ comment: Extended version of the ESOP 2024 paper with the same title +
+
+
+
+
+ + ♻ ☆ Marabou 2.0: A Versatile Formal Analyzer of Neural Networks + + +
+ This paper serves as a comprehensive system description of version 2.0 of the +Marabou framework for formal analysis of neural networks. We discuss the tool's +architectural design and highlight the major features and components introduced +since its initial release. + +
+
+ comment: Condensed version accepted at CAV'24 +
+
+
+
+
+ + ♻ ☆ On the Decidability of Monadic Second-Order Logic with Arithmetic + Predicates + + +
+ We investigate the decidability of the monadic second-order (MSO) theory of +the structure $\langle \mathbb{N};<,P_1, \ldots,P_k \rangle$, for various unary +predicates $P_1,\ldots,P_k \subseteq \mathbb{N}$. We focus in particular on +"arithmetic" predicates arising in the study of linear recurrence sequences, +such as fixed-base powers $\mathsf{Pow}_k = \{k^n : n \in \mathbb{N}\}$, $k$-th +powers $\mathsf{N}_k = \{n^k : n \in \mathbb{N}\}$, and the set of terms of the +Fibonacci sequence $\mathsf{Fib} = \{0,1,2,3,5,8,13,\ldots\}$ (and similarly +for other linear recurrence sequences having a single, non-repeated, dominant +characteristic root). We obtain several new unconditional and conditional +decidability results, a select sample of which are the following: + $\bullet$ The MSO theory of $\langle \mathbb{N};<,\mathsf{Pow}_2, +\mathsf{Fib} \rangle$ is decidable; + $\bullet$ The MSO theory of $\langle \mathbb{N};<, \mathsf{Pow}_2, +\mathsf{Pow}_3, \mathsf{Pow}_6 \rangle$ is decidable; + $\bullet$ The MSO theory of $\langle \mathbb{N};<, \mathsf{Pow}_2, +\mathsf{Pow}_3, \mathsf{Pow}_5 \rangle$ is decidable assuming Schanuel's +conjecture; + $\bullet$ The MSO theory of $\langle \mathbb{N};<, \mathsf{Pow}_4, +\mathsf{N}_2 \rangle$ is decidable; + $\bullet$ The MSO theory of $\langle \mathbb{N};<, \mathsf{Pow}_2, +\mathsf{N}_2 \rangle$ is Turing-equivalent to the MSO theory of $\langle +\mathbb{N};<,S \rangle$, where $S$ is the predicate corresponding to the binary +expansion of $\sqrt{2}$. (As the binary expansion of $\sqrt{2}$ is widely +believed to be normal, the corresponding MSO theory is in turn expected to be +decidable.) + These results are obtained by exploiting and combining techniques from +dynamical systems, number theory, and automata theory. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Embedded Finite Models Beyond Restricted Quantifier Collapse + + +
+ We revisit evaluation of logical formulas that allow both uninterpreted +relations, constrained to be finite, as well as an interpreted vocabulary over +an infinite domain. This formalism was denoted embedded finite model theory in +the past. + It is clear that the expressiveness and evaluating complexity of formulas of +this type depends heavily on the infinite structure. If we embed in a wild +structure like the integers with additive and multiplicative arithmetic, logic +is extremely expressive and formulas are impossible to evaluate. On the other +hand, for some well-known decidable structures, the expressiveness and +evaluating complexity are similar to the situation without the additional +infrastructure. The latter phenomenon was formalized via the notion of +``Restricted Quantifier Collapse'': adding quantification over the infinite +structure does not add expressiveness. Beyond these two extremes little was +known. + In this work we show that the possibilities for expressiveness and complexity +are much wider. We show that we can get almost any possible complexity of +evaluation while staying within a decidable structure. We also show that in +some decidable structures, there is a disconnect between expressiveness of the +logic and complexity, in that we cannot eliminate quantification over the +structure, but this is not due to an ability to embed complex relational +computation in the logic. + We show failure of collapse for the theory of finite fields and the related +theory of pseudo-finite fields, which will involve coding computation in the +logic. As a by-product of this, we establish new lower-bounds for the +complexity of decision procedures for several decidable theories of fields, +including the theory of finite fields. + In the process of investigating this landscape, we investigate several +weakenings of collapse. + +
+
+
+
+
+ + ♻ ☆ Fair Asynchronous Session Subtyping + + +
+ Session types are widely used as abstractions of asynchronous message passing +systems. Refinement for such abstractions is crucial as it allows improvements +of a given component without compromising its compatibility with the rest of +the system. In the context of session types, the most general notion of +refinement is asynchronous session subtyping, which allows message emissions to +be anticipated w.r.t. a bounded amount of message consumptions. In this paper +we investigate the possibility to anticipate emissions w.r.t. an unbounded +amount of consumptions: to this aim we propose to consider fair compliance over +asynchronous session types and fair refinement as the relation that preserves +it. This allows us to propose a novel variant of session subtyping that +leverages the notion of controllability from service contract theory and that +is a sound characterisation of fair refinement. In addition, we show that both +fair refinement and our novel subtyping are undecidable. We also present a +sound algorithm which deals with examples that feature potentially unbounded +buffering. Finally, we present an implementation of our algorithm and an +empirical evaluation of it on synthetic benchmarks. + +
+
+
+
+
+
+
+
+
 Hardware Architecture 6
 
+
+
+ + ☆ Using Formal Verification to Evaluate Single Event Upsets in a RISC-V + Core + + +
+ Reliability has been a major concern in embedded systems. Higher transistor +density and lower voltage supply increase the vulnerability of embedded systems +to soft errors. A Single Event Upset (SEU), which is also called a soft error, +can reverse a bit in a sequential element, resulting in a system failure. +Simulation-based fault injection has been widely used to evaluate reliability, +as suggested by ISO26262. However, it is practically impossible to test all +faults for a complex design. Random fault injection is a compromise that +reduces accuracy and fault coverage. Formal verification is an alternative +approach. In this paper, we use formal verification, in the form of model +checking, to evaluate the hardware reliability of a RISC-V Ibex Core in the +presence of soft errors. Backward tracing is performed to identify and +categorize faults according to their effects (no effect, Silent Data +Corruption, crashes, and hangs). By using formal verification, the entire state +space and fault list can be exhaustively explored. It is found that misaligned +instructions can amplify fault effects. It is also found that some bits are +more vulnerable to SEUs than others. In general, most of the bits in the Ibex +Core are vulnerable to Silent Data Corruption, and the second pipeline stage is +more vulnerable to Silent Data Corruption than the first. + +
+
+
+
+
+ + ☆ NeRTCAM: CAM-Based CMOS Implementation of Reference Frames for + Neuromorphic Processors + + +
+ Neuromorphic architectures mimicking biological neural networks have been +proposed as a much more efficient alternative to conventional von Neumann +architectures for the exploding compute demands of AI workloads. Recent +neuroscience theory on intelligence suggests that Cortical Columns (CCs) are +the fundamental compute units in the neocortex and intelligence arises from +CC's ability to store, predict and infer information via structured Reference +Frames (RFs). Based on this theory, recent works have demonstrated brain-like +visual object recognition using software simulation. Our work is the first +attempt towards direct CMOS implementation of Reference Frames for building +CC-based neuromorphic processors. We propose NeRTCAM (Neuromorphic Reverse +Ternary Content Addressable Memory), a CAM-based building block that supports +the key operations (store, predict, infer) required to perform inference using +RFs. NeRTCAM architecture is presented in detail including its key components. +All designs are implemented in SystemVerilog and synthesized in 7nm CMOS, and +hardware complexity scaling is evaluated for varying storage sizes. NeRTCAM +system for biologically motivated MNIST inference with a storage size of 1024 +entries incurs just 0.15 mm^2 area, 400 mW power and 9.18 us critical path +latency, demonstrating the feasibility of direct CMOS implementation of +CAM-based Reference Frames. + +
+
+ comment: Accepted and Presented at Neuro-Inspired Computational Elements + (NICE) Conference, La Jolla, CA. 2024 +
+
+
+
+
+ + Automatic Hardware Pragma Insertion in High-Level Synthesis: A + Non-Linear Programming Approach + + +
+ High-level synthesis, source-to-source compilers, and various Design Space
+Exploration techniques for pragma insertion have significantly improved the
+Quality of Results of generated designs. These tools offer benefits such as
+reduced development time and enhanced performance. However, achieving
+high-quality results often requires additional manual code transformations and
+tiling selections, which are typically performed separately or as
+pre-processing steps. Although DSE techniques enable code transformation
+upfront, the vastness of the search space often limits the exploration of all
+possible code transformations, making it challenging to determine which
+transformations are necessary. Additionally, ensuring correctness remains
+challenging, especially for complex transformations and optimizations.
+ To tackle this obstacle, we first propose a comprehensive framework
+leveraging HLS compilers. Our system streamlines code transformation, pragma
+insertion, and tile size selection for on-chip data caching through a unified
+optimization problem, aiming to enhance parallelization, particularly
+beneficial for computation-bound kernels. Then, employing a novel Non-Linear
+Programming (NLP) approach, we simultaneously ascertain transformations,
+pragmas, and tile sizes, focusing on regular loop-based kernels. Our evaluation
+demonstrates that our framework adeptly identifies the appropriate
+transformations, including scenarios where no transformation is necessary, and
+inserts pragmas to achieve a favorable Quality of Results.
+
+
+
+
+
+ + ♻ ☆ Splitwise: Efficient generative LLM inference using phase splitting + + +
+ Recent innovations in generative large language models (LLMs) have made their +applications and use-cases ubiquitous. This has led to large-scale deployments +of these models, using complex, expensive, and power-hungry AI accelerators, +most commonly GPUs. These developments make LLM inference efficiency an +important challenge. Based on our extensive characterization, we find that +there are two main phases during an LLM inference request: a compute-intensive +prompt computation, and a memory-intensive token generation, each with distinct +latency, throughput, memory, and power characteristics. Despite +state-of-the-art batching and scheduling, the token generation phase +underutilizes compute resources. Specifically, unlike compute-intensive prompt +computation phases, token generation phases do not require the compute +capability of the latest GPUs, and can be run with lower power and cost. + With Splitwise, we propose splitting the two phases of a LLM inference +request on to separate machines. This allows us to use hardware that is +well-suited for each phase, and provision resources independently per phase. +However, splitting an inference request across machines requires state transfer +from the machine running prompt computation over to the machine generating +tokens. We implement and optimize this state transfer using the fast back-plane +interconnects available in today's GPU clusters. + We use the Splitwise technique to design LLM inference clusters using the +same or different types of machines for the prompt computation and token +generation phases. Our clusters are optimized for three key objectives: +throughput, cost, and power. In particular, we show that we can achieve 1.4x +higher throughput at 20% lower cost than current designs. Alternatively, we can +achieve 2.35x more throughput with the same cost and power budgets. + +
+
+ comment: 12 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ RTLFixer: Automatically Fixing RTL Syntax Errors with Large Language + Models + + +
+ This paper presents RTLFixer, a novel framework enabling automatic syntax +errors fixing for Verilog code with Large Language Models (LLMs). Despite LLM's +promising capabilities, our analysis indicates that approximately 55% of errors +in LLM-generated Verilog are syntax-related, leading to compilation failures. +To tackle this issue, we introduce a novel debugging framework that employs +Retrieval-Augmented Generation (RAG) and ReAct prompting, enabling LLMs to act +as autonomous agents in interactively debugging the code with feedback. This +framework demonstrates exceptional proficiency in resolving syntax errors, +successfully correcting about 98.5% of compilation errors in our debugging +dataset, comprising 212 erroneous implementations derived from the VerilogEval +benchmark. Our method leads to 32.3% and 10.1% increase in pass@1 success rates +in the VerilogEval-Machine and VerilogEval-Human benchmarks, respectively. + +
+
+
+
+
+ + ♻ ☆ Chiplet Cloud: Building AI Supercomputers for Serving Large Generative + Language Models + + +
+ Large language models (LLMs) such as OpenAI's ChatGPT and Google's Gemini +have demonstrated unprecedented capabilities of autoregressive AI models across +multiple tasks triggering disruptive technology innovations around the world. +However, as models continue to grow the cost to serve these models also +continues to grow threatening the democratization of LLMs. + To address this issue, we propose Chiplet Cloud, a chiplet-based ASIC +LLM-supercomputer architecture whose goal is to optimize the total cost of +ownership (TCO) per generated token. This architecture is a highly +parameterizable ASIC and server-level architecture leveraging thousands of +replicated accelerator modules collaborating to scale-up the performance of +LLMs at cloud-scale. To determine specific parameterizations of the Chiplet +Cloud architecture, we implemented a two-phase hardware-software co-design +methodology that can search the massive design space and fine tune the +architecture across a collection of LLMs based on an accurate inference +simulation. A common bottleneck for LLMs is the memory access performance +therefore we introduce CC-MEM, a scalable on-chip memory system for Chiplet +Cloud architectures. Using the CC-MEM, Chiplet Clouds can be built using only +SRAMs for design points where the power and performance of memory access is +critical. The CC-MEM also includes a compression decoder module to add support +for sparse models without impacting the compute units using a +Store-as-Compressed, Load-as-Dense mechanism. + We evaluate Chiplet Cloud architectures across eight popular LLMs. Using fine +tuned Chiplet Cloud servers we are able to achieve $97\times$ and $18\times$ +improvement in TCO/Token over rented GPU and TPU clouds, or a $8.3\times$ and +$3.7\times$ improvement over fabricated GPU and TPU clouds respectively. +Chiplet Cloud can also support $1.7\times$ larger models with a sparsity of +60\%. + +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 22 + +
+
+
+ + ☆ Nearest Neighbors GParareal: Improving Scalability of Gaussian Processes + for Parallel-in-Time Solvers + + +
+ With the advent of supercomputers, multi-processor environments and +parallel-in-time (PinT) algorithms offer ways to solve initial value problems +for ordinary and partial differential equations (ODEs and PDEs) over long time +intervals, a task often unfeasible with sequential solvers within realistic +time frames. A recent approach, GParareal, combines Gaussian Processes with +traditional PinT methodology (Parareal) to achieve faster parallel speed-ups. +The method is known to outperform Parareal for low-dimensional ODEs and a +limited number of computer cores. Here, we present Nearest Neighbors GParareal +(nnGParareal), a novel data-enriched PinT integration algorithm. nnGParareal +builds upon GParareal by improving its scalability properties for +higher-dimensional systems and increased processor count. Through data +reduction, the model complexity is reduced from cubic to log-linear in the +sample size, yielding a fast and automated procedure to integrate initial value +problems over long time intervals. First, we provide both an upper bound for +the error and theoretical details on the speed-up benefits. Then, we +empirically illustrate the superior performance of nnGParareal, compared to +GParareal and Parareal, on nine different systems with unique features (e.g., +stiff, chaotic, high-dimensional, or challenging-to-learn systems). + +
+
+
+
+
+ + ☆ EdgeLoc: A Communication-Adaptive Parallel System for Real-Time + Localization in Infrastructure-Assisted Autonomous Driving + + +
+ This paper presents EdgeLoc, an infrastructure-assisted, real-time +localization system for autonomous driving that addresses the incompatibility +between traditional localization methods and deep learning approaches. The +system is built on top of the Robot Operating System (ROS) and combines the +real-time performance of traditional methods with the high accuracy of deep +learning approaches. The system leverages edge computing capabilities of +roadside units (RSUs) for precise localization to enhance on-vehicle +localization that is based on the real-time visual odometry. EdgeLoc is a +parallel processing system, utilizing a proposed uncertainty-aware pose fusion +solution. It achieves communication adaptivity through online learning and +addresses fluctuations via window-based detection. Moreover, it achieves +optimal latency and maximum improvement by utilizing auto-splitting +vehicle-infrastructure collaborative inference, as well as online distribution +learning for decision-making. Even with the most basic end-to-end deep neural +network for localization estimation, EdgeLoc realizes a 67.75\% reduction in +the localization error for real-time local visual odometry, a 29.95\% reduction +for non-real-time collaborative inference, and a 30.26\% reduction compared to +Kalman filtering. Finally, accuracy-to-latency conversion was experimentally +validated, and an overall experiment was conducted on a practical cellular +network. The system is open sourced at +https://github.com/LoganCome/EdgeAssistedLocalization. + +
+
+
+
+
+ + ☆ Strongly-Consistent Distributed Discrete-event Systems + + +
+ Discrete-event (DE) systems are concurrent programs where components +communicate via tagged events, where tags are drawn from a totally ordered set. +Reactors are an emerging model of computation based on DE and realized in the +open-source coordination language Lingua Franca. Distributed DE (DDE) systems +are DE systems where the components (reactors) communicate over networks. The +prior art has required that for DDE systems with cycles, each cycle must +contain at least one logical delay, where the tag of events is incremented. +Such delays, however, are not required by the elegant fixed-point semantics of +DE. The only requirement is that the program be constructive, meaning it is +free of causality cycles. This paper gives a way to coordinate the execution of +DDE systems that can execute any constructive program, even one with zero-delay +cycles. It provides a formal model that exposes exactly the information that +must be shared across networks for such execution to be possible. Furthermore, +it describes a concrete implementation that is an extension of the coordination +mechanisms in Lingua Franca. + +
+
+
+
+
+ + ☆ PARALLELGPUOS: A Concurrent OS-level GPU Checkpoint and Restore System + using Validated Speculation + + +
+ Checkpointing (C) and restoring (R) are key components for GPU tasks. POS is
+an OS-level GPU C/R system: It can transparently checkpoint or restore
+processes that use the GPU, without requiring any cooperation from the
+application, a key feature required by modern systems like the cloud. Moreover,
+POS is the first OS-level C/R system that can concurrently execute C/R with the
+application execution: a critical feature that can be trivially achieved when
+the processes are only running on the CPU, but becomes challenging when the
+processes use GPU. The problem is how to ensure consistency during concurrent
+execution with the lack of application semantics due to transparency. CPU
+processes can leverage OS and hardware paging to fix inconsistency without
+application semantics. Unfortunately, GPU bypasses OS and paging for high
+performance. POS fills the semantic gap by speculatively extracting buffer
+access information of GPU kernels during runtime. Thanks to the simple and
+well-structured nature of GPU kernels, our speculative extraction (with runtime
+validation) achieves 100% accuracy on applications from training to inference
+whose domains span from vision, large language models, and reinforcement
+learning. Based on the extracted semantics, we systematically overlap C/R with
+application execution, and achieve orders of magnitude higher performance
+under various tasks compared with the state-of-the-art OS-level GPU C/R,
+including training fault tolerance, live GPU process migration, and cold starts
+acceleration in GPU-based serverless computing.
+
+
+
+
+
+ + ☆ Parallelization of the K-Means Algorithm with Applications to Big Data + Clustering + + +
+ The K-Means clustering using Lloyd's algorithm is an iterative approach to
+partition the given dataset into K different clusters. The algorithm assigns
+each point to the cluster based on the following objective function
+ \[\ \min \Sigma_{i=1}^{n}||x_i-\mu_{x_i}||^2\] The serial algorithm involves
+iterative steps where we compute the distance of each datapoint from the
+centroids and assign the datapoint to the nearest centroid. This approach is
+essentially known as the expectation-maximization step. Clustering involves
+extensive computations to calculate distances at each iteration, which
+increases as the number of data points increases. This provides scope for
+parallelism. However, we must ensure that in a parallel process, each thread
+has access to the updated centroid value and no racing condition exists on any
+centroid values. We will compare two different approaches in this project. The
+first approach is an OpenMP flat synchronous method where all processes are run
+in parallel, and we use synchronization to ensure safe updates of clusters. The
+second approach we adopt is a GPU based parallelization approach using OpenACC
+wherein we will try to make use of GPU architecture to parallelize chunks of
+the algorithm to observe decreased computation time. We will analyze metrics
+such as speed up, efficiency, time taken with varying data points, and number
+of processes to compare the two approaches and understand the relative
+performance improvement we can get.
+
+
+ comment: 7 Pages, 5 tables, 12 figures +
+
+
+
+
+ + ☆ Energy-Efficient Federated Edge Learning with Streaming Data: A Lyapunov + Optimization Approach + + +
+ Federated learning (FL) has received significant attention in recent years +for its advantages in efficient training of machine learning models across +distributed clients without disclosing user-sensitive data. Specifically, in +federated edge learning (FEEL) systems, the time-varying nature of wireless +channels introduces inevitable system dynamics in the communication process, +thereby affecting training latency and energy consumption. In this work, we +further consider a streaming data scenario where new training data samples are +randomly generated over time at edge devices. Our goal is to develop a dynamic +scheduling and resource allocation algorithm to address the inherent randomness +in data arrivals and resource availability under long-term energy constraints. +To achieve this, we formulate a stochastic network optimization problem and use +the Lyapunov drift-plus-penalty framework to obtain a dynamic resource +management design. Our proposed algorithm makes adaptive decisions on device +scheduling, computational capacity adjustment, and allocation of bandwidth and +transmit power in every round. We provide convergence analysis for the +considered setting with heterogeneous data and time-varying objective +functions, which supports the rationale behind our proposed scheduling design. +The effectiveness of our scheme is verified through simulation results, +demonstrating improved learning performance and energy efficiency as compared +to baseline schemes. + +
+
+ comment: Submitted to IEEE journals for possible publication +
+
+
+
+
+ + ☆ Vertical Federated Learning Hybrid Local Pre-training + + +
+ Vertical Federated Learning (VFL), which has a broad range of real-world +applications, has received much attention in both academia and industry. +Enterprises aspire to exploit more valuable features of the same users from +diverse departments to boost their model prediction skills. VFL addresses this +demand and concurrently secures individual parties from exposing their raw +data. However, conventional VFL encounters a bottleneck as it only leverages +aligned samples, whose size shrinks with more parties involved, resulting in +data scarcity and the waste of unaligned data. To address this problem, we +propose a novel VFL Hybrid Local Pre-training (VFLHLP) approach. VFLHLP first +pre-trains local networks on the local data of participating parties. Then it +utilizes these pre-trained networks to adjust the sub-model for the labeled +party or enhance representation learning for other parties during downstream +federated learning on aligned data, boosting the performance of federated +models. + +
+
+
+
+
+ + ☆ PLASMA -- Platform for Service Management in Digital Remote Maintenance + Applications + + +
+ To support maintenance and servicing of industrial machines, service +processes are even today often performed manually and analogously, although +supportive technologies such as augmented reality, virtual reality and digital +platforms already exist. In many cases, neither technicians on-site nor remote +experts have all the essential information and options for suitable actions +available. Existing service products and platforms do not cover all the +required functions in practice in order to map end-to-end processes. PLASMA is +a concept for a Cloud-based remote maintenance platform designed to meet these +demands. But for a real-life implementation of PLASMA, security measures are +essential as we show in this paper. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ FedCAda: Adaptive Client-Side Optimization for Accelerated and Stable + Federated Learning + + +
+ Federated learning (FL) has emerged as a prominent approach for collaborative +training of machine learning models across distributed clients while preserving +data privacy. However, the quest to balance acceleration and stability becomes +a significant challenge in FL, especially on the client-side. In this paper, we +introduce FedCAda, an innovative federated client adaptive algorithm designed +to tackle this challenge. FedCAda leverages the Adam algorithm to adjust the +correction process of the first moment estimate $m$ and the second moment +estimate $v$ on the client-side and aggregate adaptive algorithm parameters on +the server-side, aiming to accelerate convergence speed and communication +efficiency while ensuring stability and performance. Additionally, we +investigate several algorithms incorporating different adjustment functions. +This comparative analysis revealed that due to the limited information +contained within client models from other clients during the initial stages of +federated learning, more substantial constraints need to be imposed on the +parameters of the adaptive algorithm. As federated learning progresses and +clients gather more global information, FedCAda gradually diminishes the impact +on adaptive parameters. These findings provide insights for enhancing the +robustness and efficiency of algorithmic improvements. Through extensive +experiments on computer vision (CV) and natural language processing (NLP) +datasets, we demonstrate that FedCAda outperforms the state-of-the-art methods +in terms of adaptability, convergence, stability, and overall performance. This +work contributes to adaptive algorithms for federated learning, encouraging +further exploration. + +
+
+
+
+
+ + ☆ Securing Blockchain-based IoT Systems with Physical Unclonable Functions + and Zero-Knowledge Proofs + + +
+ This paper presents a framework for securing blockchain-based IoT systems by +integrating Physical Unclonable Functions (PUFs) and Zero-Knowledge Proofs +(ZKPs) within a Hyperledger Fabric environment. The proposed framework +leverages PUFs for unique device identification and ZKPs for privacy-preserving +authentication and transaction processing. Experimental results demonstrate the +framework's feasibility, performance, and security against various attacks. +This framework provides a comprehensive solution for addressing the security +challenges in blockchain-based IoT systems. + +
+
+
+
+
+ + ☆ Cost-Optimal Microservices Deployment with Cluster Autoscaling and Spot + Pricing + + +
+ Microservices architecture has been established as an ideal software +architecture for cloud-based software development and deployment, offering many +benefits such as agility and efficiency. Microservices are often associated +with containers and container orchestration systems for deployment, as +containerization provides convenient tools and techniques for resource +management, including the automation of orchestration processes. Among the +factors that make the cloud suitable for commercial software deployment, +transient pricing options like AWS Spot Pricing are particularly attractive as +they allow consumers to significantly reduce cloud costs. However, the dynamic +nature of resource demand and the abrupt termination of spot VMs make transient +pricing challenging. Nonetheless, containerization and container orchestration +systems open new avenues to optimize the cost of microservices deployments by +leveraging spot pricing on the public cloud while achieving application and +business goals. + We propose SpotKube, an open-source, Kubernetes-based, application-aware, +genetic algorithm-based solution for cost optimization, which autoscales +clusters for microservices-based applications hosted on public clouds with spot +pricing options. SpotKube analyzes application characteristics and recommends +the optimal configuration for resource allocation to the cluster. It consists +of an elastic cluster autoscaler powered by an optimization algorithm that +ensures cost-effective microservices deployment while meeting application +performance requirements and handling abrupt termination of nodes, thereby +minimizing the impact on system availability. We implement and evaluate +SpotKube with representative microservices-based applications in a real public +cloud setup, demonstrating the effectiveness of our approach against +alternative optimization strategies. + +
+
+ comment: 11 pages including references, 11 figures, Keywords: Microservice, + Cost optimization, Cluster Autoscaling, Transient Pricing +
+
+
+
+
+ + ☆ Practical Performance of a Distributed Processing Framework for + Machine-Learning-based NIDS + + +
+ Network Intrusion Detection Systems (NIDSs) detect intrusion attacks in +network traffic. In particular, machine-learning-based NIDSs have attracted +attention because of their high detection rates of unknown attacks. A +distributed processing framework for machine-learning-based NIDSs employing a +scalable distributed stream processing system has been proposed in the +literature. However, its performance, when machine-learning-based classifiers +are implemented has not been comprehensively evaluated. In this study, we +implement five representative classifiers (Decision Tree, Random Forest, Naive +Bayes, SVM, and kNN) based on this framework and evaluate their throughput and +latency. By conducting the experimental measurements, we investigate the +difference in the processing performance among these classifiers and the +bottlenecks in the processing performance of the framework. + +
+
+ comment: This paper was accepted at the 14th IEEE International Workshop on + Network Technologies for Security, Administration & Protection (NETSAP 2024) +
+
+
+
+
+ + ☆ StatAvg: Mitigating Data Heterogeneity in Federated Learning for + Intrusion Detection Systems + + +
+ Federated learning (FL) is a decentralized learning technique that enables +participating devices to collaboratively build a shared Machine Leaning (ML) or +Deep Learning (DL) model without revealing their raw data to a third party. Due +to its privacy-preserving nature, FL has sparked widespread attention for +building Intrusion Detection Systems (IDS) within the realm of cybersecurity. +However, the data heterogeneity across participating domains and entities +presents significant challenges for the reliable implementation of an FL-based +IDS. In this paper, we propose an effective method called Statistical Averaging +(StatAvg) to alleviate non-independently and identically (non-iid) distributed +features across local clients' data in FL. In particular, StatAvg allows the FL +clients to share their individual data statistics with the server, which then +aggregates this information to produce global statistics. The latter are shared +with the clients and used for universal data normalisation. It is worth +mentioning that StatAvg can seamlessly integrate with any FL aggregation +strategy, as it occurs before the actual FL training process. The proposed +method is evaluated against baseline approaches using datasets for network and +host Artificial Intelligence (AI)-powered IDS. The experimental results +demonstrate the efficiency of StatAvg in mitigating non-iid feature +distributions across the FL clients compared to the baseline methods. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ How Does Stake Distribution Influence Consensus? Analyzing Blockchain + Decentralization + + +
+ In the PoS blockchain landscape, the challenge of achieving full +decentralization is often hindered by a disproportionate concentration of +staked tokens among a few validators. This study analyses this challenge by +first formalizing decentralization metrics for weighted consensus mechanisms. +An empirical analysis across ten permissionless blockchains uncovers +significant weight concentration among validators, underscoring the need for an +equitable approach. To counter this, we introduce the Square Root Stake Weight +(SRSW) model, which effectively recalibrates staking weight distribution. Our +examination of the SRSW model demonstrates notable improvements in the +decentralization metrics: the Gini index improves by 37.16% on average, while +Nakamoto coefficients for liveness and safety see mean enhancements of 101.04% +and 80.09%, respectively. This research is a pivotal step toward a more fair +and equitable distribution of staking weight, advancing the decentralization in +blockchain consensus mechanisms. + +
+
+ comment: To appear in ICBC 2024 +
+
+
+
+
+ + ♻ ☆ Splitwise: Efficient generative LLM inference using phase splitting + + +
+ Recent innovations in generative large language models (LLMs) have made their +applications and use-cases ubiquitous. This has led to large-scale deployments +of these models, using complex, expensive, and power-hungry AI accelerators, +most commonly GPUs. These developments make LLM inference efficiency an +important challenge. Based on our extensive characterization, we find that +there are two main phases during an LLM inference request: a compute-intensive +prompt computation, and a memory-intensive token generation, each with distinct +latency, throughput, memory, and power characteristics. Despite +state-of-the-art batching and scheduling, the token generation phase +underutilizes compute resources. Specifically, unlike compute-intensive prompt +computation phases, token generation phases do not require the compute +capability of the latest GPUs, and can be run with lower power and cost. + With Splitwise, we propose splitting the two phases of a LLM inference +request on to separate machines. This allows us to use hardware that is +well-suited for each phase, and provision resources independently per phase. +However, splitting an inference request across machines requires state transfer +from the machine running prompt computation over to the machine generating +tokens. We implement and optimize this state transfer using the fast back-plane +interconnects available in today's GPU clusters. + We use the Splitwise technique to design LLM inference clusters using the +same or different types of machines for the prompt computation and token +generation phases. Our clusters are optimized for three key objectives: +throughput, cost, and power. In particular, we show that we can achieve 1.4x +higher throughput at 20% lower cost than current designs. Alternatively, we can +achieve 2.35x more throughput with the same cost and power budgets. + +
+
+ comment: 12 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ On the Communication Complexity of Decentralized Bilevel Optimization + + +
+ Decentralized bilevel optimization has been actively studied in the past few +years since it has widespread applications in machine learning. However, +existing algorithms suffer from large communication complexity caused by the +estimation of stochastic hypergradient, limiting their application to +real-world tasks. To address this issue, we develop a novel decentralized +stochastic bilevel gradient descent algorithm under the heterogeneous setting, +which enjoys a small communication cost in each round and a small number of +communication rounds. As such, it can achieve a much better communication +complexity than existing algorithms without any strong assumptions regarding +heterogeneity. To the best of our knowledge, this is the first stochastic +algorithm achieving these theoretical results under the heterogeneous setting. +At last, the experimental results confirm the efficacy of our algorithm. + +
+
+
+
+
+ + ♻ ☆ Pollen: High-throughput Federated Learning Simulation via Resource-Aware + Client Placement + + +
+ Federated Learning (FL) is a privacy-focused machine learning paradigm that
+collaboratively trains models directly on edge devices. Simulation plays an
+essential role in FL adoption, helping develop novel aggregation and client
+sampling strategies. However, current simulators cannot emulate large-scale
+systems in a time-efficient manner, which limits their utility and casts doubts
+on generalizability.
+ This work proposes Pollen, a novel resource-aware system for speeding up
+simulations. Pollen addresses two limiting factors from existing simulators:
+(a) communication inefficiency derived from pull-based client execution and (b)
+inadequate load balance when using heterogeneous hardware. Pollen executes
+high-throughput FL simulations at scale by (a) using a push-based client
+placement system, (b) learning an adaptable scheduling of clients based on
+hardware statistics, and (c) estimating the optimal number of concurrent
+workers per GPU. We evaluate Pollen on four representative FL tasks and show
+that Pollen's placement model increases GPU utilization and reduces idle time.
+We compare Pollen to Flower, Flute, FedScale, Parrot, and pfl and show
+experimental speed-ups of days or weeks.
+
+
+ comment: 22 pages, 22 figures, 9 tables, under review +
+
+
+
+
+ + ♻ ☆ CoRaiS: Lightweight Real-Time Scheduler for Multi-Edge Cooperative + Computing + + +
+ Multi-edge cooperative computing that combines constrained resources of +multiple edges into a powerful resource pool has the potential to deliver great +benefits, such as a tremendous computing power, improved response time, more +diversified services. However, the mass heterogeneous resources composition and +lack of scheduling strategies make the modeling and cooperating of multi-edge +computing system particularly complicated. This paper first proposes a +system-level state evaluation model to shield the complex hardware +configurations and redefine the different service capabilities at heterogeneous +edges. Secondly, an integer linear programming model is designed to cater for +optimally dispatching the distributed arriving requests. Finally, a +learning-based lightweight real-time scheduler, CoRaiS, is proposed. CoRaiS +embeds the real-time states of multi-edge system and requests information, and +combines the embeddings with a policy network to schedule the requests, so that +the response time of all requests can be minimized. Evaluation results verify +that CoRaiS can make a high-quality scheduling decision in real time, and can +be generalized to other multi-edge computing system, regardless of system +scales. Characteristic validation also demonstrates that CoRaiS successfully +learns to balance loads, perceive real-time state and recognize heterogeneity +while scheduling. + +
+
+ comment: Accepted by IEEE Internet of Things Journal +
+
+
+
+
+ + ♻ ☆ Blockchain based Secure Energy Marketplace Scheme to Motivate Peer to + Peer Microgrids + + +
+ In the past years trend of microgrids is increasing very fast to reduce +peak-hour costs. However, in these systems, third parties are still involved in +selling surplus energy. This results in increased cost of energy and there are +many operational and security barriers in such systems. These issues can be +solved by the decentralized distributed system of microgrids where a consumer +can locally sell their surplus energy to another consumer. To deploy such a +system, one must consider security barriers for the transaction of energy. This +paper proposes a solution to these problems by devising a scheme as a +marketplace where users interact with each other to buy and sell energy at +better rates and get energy-generating resources on lease so that users do not +have to worry about capital investment. Agreement between owner of resources +and consumer is recorded on blockchain based smart contracts. In this paper, a +survey is performed for existing well known, decentralized energy solutions. +This paper also proposes an extra layer of security to leverage a shielded +execution environment so that information of energy generated, utilized, and +shared cannot be changed by consumers and third parties even if the system is +compromised. + +
+
+
+
+
+ + ♻ ☆ Collaborative Satellite Computing through Adaptive DNN Task Splitting + and Offloading SC + + +
+ Satellite computing has emerged as a promising technology for next-generation +wireless networks. This innovative technology provides data processing +capabilities, which facilitates the widespread implementation of artificial +intelligence (AI)-based applications, especially for image processing tasks +involving deep neural network (DNN). With the limited computing resources of an +individual satellite, independently handling DNN tasks generated by diverse +user equipments (UEs) becomes a significant challenge. One viable solution is +dividing a DNN task into multiple subtasks and subsequently distributing them +across multiple satellites for collaborative computing. However, it is +challenging to partition DNN appropriately and allocate subtasks into suitable +satellites while ensuring load balancing. To this end, we propose a +collaborative satellite computing system designed to improve task processing +efficiency in satellite networks. Based on this system, a workload-balanced +adaptive task splitting scheme is developed to equitably distribute the +workload of DNN slices for collaborative inference, consequently enhancing the +utilization of satellite computing resources. Additionally, a self-adaptive +task offloading scheme based on a genetic algorithm (GA) is introduced to +determine optimal offloading decisions within dynamic network environments. The +numerical results illustrate that our proposal can outperform comparable +methods in terms of task completion rate, delay, and resource utilization. + +
+
+ comment: Accepted by 29th IEEE Symposium on Computers and Communications + (ISCC) +
+
+
+
+
+ + ♻ ☆ Online Load and Graph Balancing for Random Order Inputs + + +
+ Online load balancing for heterogeneous machines aims to minimize the +makespan (maximum machine workload) by scheduling arriving jobs with varying +sizes on different machines. In the adversarial setting, where an adversary +chooses not only the collection of job sizes but also their arrival order, the +problem is well-understood and the optimal competitive ratio is known to be +$\Theta(\log m)$ where $m$ is the number of machines. In the more realistic +random arrival order model, the understanding is limited. Previously, the best +lower bound on the competitive ratio was only $\Omega(\log \log m)$. + We significantly improve this bound by showing an $\Omega( \sqrt {\log m})$ +lower bound, even for the restricted case where each job has a unit size on two +machines and infinite size on the others. On the positive side, we propose an +$O(\log m/\log \log m)$-competitive algorithm, demonstrating that better +performance is possible in the random arrival model. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Training of Deep Neural Networks Using Early Exiting + + +
+ Deep neural networks provide state-of-the-art accuracy for vision tasks but +they require significant resources for training. Thus, they are trained on +cloud servers far from the edge devices that acquire the data. This issue +increases communication cost, runtime and privacy concerns. In this study, a +novel hierarchical training method for deep neural networks is proposed that +uses early exits in a divided architecture between edge and cloud workers to +reduce the communication cost, training runtime and privacy concerns. The +method proposes a brand-new use case for early exits to separate the backward +pass of neural networks between the edge and the cloud during the training +phase. We address the issues of most available methods that due to the +sequential nature of the training phase, cannot train the levels of hierarchy +simultaneously or they do it with the cost of compromising privacy. In +contrast, our method can use both edge and cloud workers simultaneously, does +not share the raw input data with the cloud and does not require communication +during the backward pass. Several simulations and on-device experiments for +different neural network architectures demonstrate the effectiveness of this +method. It is shown that the proposed method reduces the training runtime for +VGG-16 and ResNet-18 architectures by 29% and 61% in CIFAR-10 classification +and by 25% and 81% in Tiny ImageNet classification when the communication with +the cloud is done over a low bit rate channel. This gain in the runtime is +achieved whilst the accuracy drop is negligible. This method is advantageous +for online learning of high-accuracy deep neural networks on sensor-holding +low-resource devices such as mobile phones or robots as a part of an edge-cloud +system, making them more flexible in facing new tasks and classes of data. + +
+
+ comment: Accepted to IEEE Transactions on Neural Networks and Learning Systems + (2024), 15 pages, 10 figures, 3 Tables +
+
+
+
+
+
+
+
+ + Programming and Languages 2 + +
+
+
+ + ♻ ☆ Scoped Effects as Parameterized Algebraic Theories + + +
+ Notions of computation can be modelled by monads. Algebraic effects offer a +characterization of monads in terms of algebraic operations and equational +axioms, where operations are basic programming features, such as reading or +updating the state, and axioms specify observably equivalent expressions. +However, many useful programming features depend on additional mechanisms such +as delimited scopes or dynamically allocated resources. Such mechanisms can be +supported via extensions to algebraic effects including scoped effects and +parameterized algebraic theories. We present a fresh perspective on scoped +effects by translation into a variation of parameterized algebraic theories. +The translation enables a new approach to equational reasoning for scoped +effects and gives rise to an alternative characterization of monads in terms of +generators and equations involving both scoped and algebraic operations. We +demonstrate the power of our fresh perspective by way of equational +characterizations of several known models of scoped effects. + +
+
+ comment: Extended version of the ESOP 2024 paper with the same title +
+
+
+
+
+ + ♻ ☆ Fair Asynchronous Session Subtyping + + +
+ Session types are widely used as abstractions of asynchronous message passing +systems. Refinement for such abstractions is crucial as it allows improvements +of a given component without compromising its compatibility with the rest of +the system. In the context of session types, the most general notion of +refinement is asynchronous session subtyping, which allows message emissions to +be anticipated w.r.t. a bounded amount of message consumptions. In this paper +we investigate the possibility to anticipate emissions w.r.t. an unbounded +amount of consumptions: to this aim we propose to consider fair compliance over +asynchronous session types and fair refinement as the relation that preserves +it. This allows us to propose a novel variant of session subtyping that +leverages the notion of controllability from service contract theory and that +is a sound characterisation of fair refinement. In addition, we show that both +fair refinement and our novel subtyping are undecidable. We also present a +sound algorithm which deals with examples that feature potentially unbounded +buffering. Finally, we present an implementation of our algorithm and an +empirical evaluation of it on synthetic benchmarks. + +
+
+
+
+
+
+
+
+ + Performance Profiling 2 + +
+
+
+ + ☆ Count-Min Sketch with Conservative Updates: Worst-Case Analysis + + +
+ Count-Min Sketch with Conservative Updates (\texttt{CMS-CU}) is a +memory-efficient hash-based data structure used to estimate the occurrences of +items within a data stream. \texttt{CMS-CU} stores~$m$ counters and employs~$d$ +hash functions to map items to these counters. We first argue that the +estimation error in \texttt{CMS-CU} is maximal when each item appears at most +once in the stream. Next, we study \texttt{CMS-CU} in this setting. Precisely, +\begin{enumerate} + \item In the case where~$d=m-1$, we prove that the average estimation error +and the average counter rate converge almost surely to~$\frac{1}{2}$, +contrasting with the vanilla Count-Min Sketch, where the average counter rate +is equal to~$\frac{m-1}{m}$. + \item For any given~$m$ and~$d$, we prove novel lower and upper bounds on the +average estimation error, incorporating a positive integer parameter~$g$. +Larger values of this parameter improve the accuracy of the bounds. Moreover, +the computation of each bound involves examining an ergodic Markov process with +a state space of size~$\binom{m+g-d}{g}$ and a sparse transition probabilities +matrix containing~$\mathcal{O}(m\binom{m+g-d}{g})$ non-zero entries. + \item For~$d=m-1$, $g=1$, and as $m\to \infty$, we show that the lower and +upper bounds coincide. In general, our bounds exhibit high accuracy for small +values of $g$, as shown by numerical computation. For example, for $m=50$, +$d=4$, and $g=5$, the difference between the lower and upper bounds is smaller +than~$10^{-4}$. + \end{enumerate} + +
+
+
+
+
+ + ☆ Response time in a pair of processor sharing queues with + Join-the-Shortest-Queue scheduling + + +
+ Join-the-Shortest-Queue (JSQ) is the scheduling policy of choice for many +network providers, cloud servers and traffic management systems, where +individual queues are served under processor sharing (PS) queueing discipline. +A numerical solution for the response time distribution in two parallel PS +queues with JSQ scheduling is derived for the first time. Using the generating +function method, two partial differential equations (PDEs) are obtained +corresponding to conditional response times, where the conditioning is on a +particular traced task joining the first or the second queue. These PDEs are +functional equations that contain partial generating functions and their +partial derivatives, and therefore cannot be solved by commonly used +techniques. We are able to solve these PDEs numerically with good accuracy and +perform the deconditioning with respect to the queue-length probabilities by +evaluating a certain complex integral. Numerical results for the density and +the first four moments compare well against regenerative simulation with +500,000 regeneration cycles. + +
+
+
+
+
+
+
+
+ + Operation Systems 1 + +
+
+
+ + ☆ PARALLELGPUOS: A Concurrent OS-level GPU Checkpoint and Restore System + using Validated Speculation + + +
+ Checkpointing (C) and restoring (R) are key components for GPU tasks. POS is
+an OS-level GPU C/R system: It can transparently checkpoint or restore
+processes that use the GPU, without requiring any cooperation from the
+application, a key feature required by modern systems like the cloud. Moreover,
+POS is the first OS-level C/R system that can concurrently execute C/R with the
+application execution: a critical feature that can be trivially achieved when
+the processes are only running on the CPU, but becomes challenging when the
+processes use the GPU. The problem is how to ensure consistency during
+concurrent execution with the lack of application semantics due to
+transparency. CPU processes can leverage OS and hardware paging to fix
+inconsistency without application semantics. Unfortunately, GPU bypasses OS and
+paging for high performance. POS fills the semantic gap by speculatively
+extracting buffer access information of GPU kernels during runtime. Thanks to
+the simple and well-structured nature of GPU kernels, our speculative
+extraction (with runtime validation) achieves 100% accuracy on applications
+from training to inference whose domains span from vision, large language
+models, and reinforcement learning. Based on the extracted semantics, we
+systematically overlap C/R with application execution, and achieve orders of
+magnitude higher performance under various tasks compared with the
+state-of-the-art OS-level GPU C/R, including training fault tolerance, live
+GPU process migration, and cold starts acceleration in GPU-based serverless
+computing.
+
+
+
+
+
+
+
+
+
+ + Computational Complexity 4 + +
+
+
+ + ☆ Noise-tolerant learnability of shallow quantum circuits from statistics + and the cost of quantum pseudorandomness + + +
+ This work studies the learnability of unknown quantum circuits in the near +term. We prove the natural robustness of quantum statistical queries for +learning quantum processes and provide an efficient way to benchmark various +classes of noise from statistics, which gives us a powerful framework for +developing noise-tolerant algorithms. We adapt a learning algorithm for +constant-depth quantum circuits to the quantum statistical query setting with a +small overhead in the query complexity. We prove average-case lower bounds for +learning random quantum circuits of logarithmic and higher depths within +diamond distance with statistical queries. Additionally, we show the hardness +of the quantum threshold search problem from quantum statistical queries and +discuss its implications for the learnability of shallow quantum circuits. +Finally, we prove that pseudorandom unitaries (PRUs) cannot be constructed +using circuits of constant depth by constructing an efficient distinguisher and +proving a new variation of the quantum no-free lunch theorem. + +
+
+ comment: 23+7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Solving promise equations over monoids and groups + + +
+ We give a complete complexity classification for the problem of finding a +solution to a given system of equations over a fixed finite monoid, given that +a solution over a more restricted monoid exists. As a corollary, we obtain a +complexity classification for the same problem over groups. + +
+
+ comment: Full version of an ICALP 2024 paper +
+
+
+
+
+ + ♻ ☆ Partial gathering of mobile agents in dynamic rings + + +
+ In this paper, we consider the partial gathering problem of mobile agents in +synchronous dynamic bidirectional ring networks. When k agents are distributed +in the network, the partial gathering problem requires, for a given positive +integer g (< k), that agents terminate in a configuration such that either at +least g agents or no agent exists at each node. So far, the partial gathering +problem has been considered in static graphs. In this paper, we start +considering partial gathering in dynamic graphs. As a first step, we consider +this problem in 1-interval connected rings, that is, one of the links in a ring +may be missing at each time step. In such networks, focusing on the +relationship between the values of k and g, we fully characterize the +solvability of the partial gathering problem and analyze the move complexity of +the proposed algorithms when the problem can be solved. First, we show that the +g-partial gathering problem is unsolvable when k <= 2g. Second, we show that +the problem can be solved with O(n log g) time and the total number of O(gn log +g) moves when 2g + 1 <= k <= 3g - 2. Third, we show that the problem can be +solved with O(n) time and the total number of O(kn) moves when 3g - 1 <= k <= +8g - 4. Notice that since k = O(g) holds when 3g - 1 <= k <= 8g - 4, the move +complexity O(kn) in this case can be represented also as O(gn). Finally, we +show that the problem can be solved with O(n) time and the total number of +O(gn) moves when k >= 8g - 3. These results mean that the partial gathering +problem can be solved also in dynamic rings when k >= 2g + 1. In addition, +agents require a total number of \Omega(gn) moves to solve the partial (resp., +total) gathering problem. Thus, when k >= 3g - 1, agents can solve the partial +gathering problem with the asymptotically optimal total number of O(gn) moves. + +
+
+
+
+
+ + ♻ ☆ Computational Lower Bounds for Graphon Estimation via Low-degree + Polynomials + + +
+ Graphon estimation has been one of the most fundamental problems in network
+analysis and has received considerable attention in the past decade. From the
+statistical perspective, the minimax error rate of graphon estimation has been
+established by Gao et al (2015) for both stochastic block model and
+nonparametric graphon estimation. The statistical optimal estimators are based
+on constrained least squares and have computational complexity exponential in
+the dimension. From the computational perspective, the best-known
+polynomial-time estimator is based on universal singular value thresholding,
+but it can only achieve a much slower estimation error rate than the minimax
+one. The computational optimality of the USVT or the existence of a
+computational barrier in graphon estimation has been a long-standing open
+problem. In this work, we provide rigorous evidence for the computational
+barrier in graphon estimation via low-degree polynomials. Specifically, in SBM
+graphon estimation, we show that for low-degree polynomial estimators, their
+estimation error rates cannot be significantly better than that of the USVT
+under a wide range of parameter regimes and in nonparametric graphon
+estimation, we show low-degree polynomial estimators achieve estimation error
+rates strictly slower than the minimax rate. Our results are proved based on
+the recent development of low-degree polynomials by Schramm and Wein (2022),
+while we overcome a few key challenges in applying it to the general graphon
+estimation problem. By leveraging our main results, we also provide a
+computational lower bound on the clustering error for community detection in
+SBM with a growing number of communities and this yields a new piece of
+evidence for the conjectured Kesten-Stigum threshold for efficient community
+recovery. Finally, we extend our computational lower bounds to sparse graphon
+estimation and biclustering.
+
+
+
+ comment: Add low-degree upper bound in v2 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 8 + +
+
+
+ + ☆ The Limits and Potentials of Local SGD for Distributed Heterogeneous + Learning with Intermittent Communication + + +
+ Local SGD is a popular optimization method in distributed learning, often +outperforming other algorithms in practice, including mini-batch SGD. Despite +this success, theoretically proving the dominance of local SGD in settings with +reasonable data heterogeneity has been difficult, creating a significant gap +between theory and practice. In this paper, we provide new lower bounds for +local SGD under existing first-order data heterogeneity assumptions, showing +that these assumptions are insufficient to prove the effectiveness of local +update steps. Furthermore, under these same assumptions, we demonstrate the +min-max optimality of accelerated mini-batch SGD, which fully resolves our +understanding of distributed optimization for several problem classes. Our +results emphasize the need for better models of data heterogeneity to +understand the effectiveness of local SGD in practice. Towards this end, we +consider higher-order smoothness and heterogeneity assumptions, providing new +upper bounds that imply the dominance of local SGD over mini-batch SGD when +data heterogeneity is low. + +
+
+
+
+
+ + ☆ A Starting Point for Dynamic Community Detection with Leiden Algorithm + + +
+ Many real-world graphs evolve with time. Identifying communities or clusters +on such graphs is an important problem. In this technical report, we extend +three dynamic approaches, namely, Naive-dynamic (ND), Delta-screening (DS), and +Dynamic Frontier (DF), to our multicore implementation of the Leiden algorithm, +an algorithm known for its high-quality community detection. Our experiments on +a server with a 64-core AMD EPYC-7742 processor demonstrate that ND, DS, and DF +Leiden achieve speedups of 1.25x, 1.24x, and 1.37x on large graphs with random +batch updates, compared to Static, ND, and DS Leiden, respectively. However, on +real-world dynamic graphs, ND Leiden performs the best, being on average 1.14x +faster than Static Leiden. We hope our early results serve as a starting point +for dynamic approaches to the Leiden algorithm on evolving graphs. + +
+
+ comment: 13 pages, 5 figures, 2 tables. arXiv admin note: substantial text + overlap with arXiv:2404.19634 +
+
+
+
+
+ + ☆ Full private delegated quantum computing tailored from user to industry + + +
+ In this paper, we present a set of private and secure delegated quantum +computing protocols and techniques tailored to user-level and industry-level +use cases, depending on the computational resources available to the client, +the specific privacy needs required, and the type of algorithm. Our protocols +are presented at a high level as they are independent of the particular +algorithm used for such encryption and decryption processes. Additionally, we +propose a method to verify the correct execution of operations by the external +server. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ Securing Health Data on the Blockchain: A Differential Privacy and + Federated Learning Framework + + +
+ This study proposes a framework to enhance privacy in Blockchain-based +Internet of Things (BIoT) systems used in the healthcare sector. The framework +addresses the challenge of leveraging health data for analytics while +protecting patient privacy. To achieve this, the study integrates Differential +Privacy (DP) with Federated Learning (FL) to protect sensitive health data +collected by IoT nodes. The proposed framework utilizes dynamic personalization +and adaptive noise distribution strategies to balance privacy and data utility. +Additionally, blockchain technology ensures secure and transparent aggregation +and storage of model updates. Experimental results on the SVHN dataset +demonstrate that the proposed framework achieves strong privacy guarantees +against various attack scenarios while maintaining high accuracy in health +analytics tasks. For 15 rounds of federated learning with an epsilon value of +8.0, the model obtains an accuracy of 64.50%. The blockchain integration, +utilizing Ethereum, Ganache, Web3.py, and IPFS, exhibits an average transaction +latency of around 6 seconds and consistent gas consumption across rounds, +validating the practicality and feasibility of the proposed approach. + +
+
+
+
+
+ + ☆ A GAN-Based Data Poisoning Attack Against Federated Learning Systems and + Its Countermeasure + + +
+ As a distributed machine learning paradigm, federated learning (FL) is +collaboratively carried out on privately owned datasets but without direct data +access. Although the original intention is to allay data privacy concerns, +"available but not visible" data in FL potentially brings new security threats, +particularly poisoning attacks that target such "not visible" local data. +Initial attempts have been made to conduct data poisoning attacks against FL +systems, but cannot be fully successful due to their high chance of causing +statistical anomalies. To unleash the potential for truly "invisible" attacks +and build a more deterrent threat model, in this paper, a new data poisoning +attack model named VagueGAN is proposed, which can generate seemingly +legitimate but noisy poisoned data by untraditionally taking advantage of +generative adversarial network (GAN) variants. Capable of manipulating the +quality of poisoned data on demand, VagueGAN enables to trade-off attack +effectiveness and stealthiness. Furthermore, a cost-effective countermeasure +named Model Consistency-Based Defense (MCD) is proposed to identify +GAN-poisoned data or models after finding out the consistency of GAN outputs. +Extensive experiments on multiple datasets indicate that our attack method is +generally much more stealthy as well as more effective in degrading FL +performance with low complexity. Our defense method is also shown to be more +competent in identifying GAN-poisoned data or models. The source codes are +publicly available at +\href{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}{https://github.com/SSssWEIssSS/VagueGAN-Data-Poisoning-Attack-and-Its-Countermeasure}. + +
+
+ comment: 18 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ DF Louvain: Fast Incrementally Expanding Approach for Community + Detection on Dynamic Graphs + + +
+ Community detection is the problem of recognizing natural divisions in +networks. A relevant challenge in this problem is to find communities on +rapidly evolving graphs. In this report we present our Parallel Dynamic +Frontier (DF) Louvain algorithm, which given a batch update of edge deletions +and insertions, incrementally identifies and processes an approximate set of +affected vertices in the graph with minimal overhead, while using a novel +approach of incrementally updating weighted-degrees of vertices and total edge +weights of communities. We also present our parallel implementations of +Naive-dynamic (ND) and Delta-screening (DS) Louvain. On a server with a 64-core +AMD EPYC-7742 processor, our experiments show that DF Louvain obtains speedups +of 179x, 7.2x, and 5.3x on real-world dynamic graphs, compared to Static, ND, +and DS Louvain, respectively, and is 183x, 13.8x, and 8.7x faster, +respectively, on large graphs with random batch updates. Moreover, DF Louvain +improves its performance by 1.6x for every doubling of threads. + +
+
+ comment: 22 pages, 15 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Game Dynamics and Equilibrium Computation in the Population Protocol + Model + + +
+ We initiate the study of game dynamics in the population protocol model: $n$ +agents each maintain a current local strategy and interact in pairs uniformly +at random. Upon each interaction, the agents play a two-person game and receive +a payoff from an underlying utility function, and they can subsequently update +their strategies according to a fixed local algorithm. In this setting, we ask +how the distribution over agent strategies evolves over a sequence of +interactions, and we introduce a new distributional equilibrium concept to +quantify the quality of such distributions. As an initial example, we study a +class of repeated prisoner's dilemma games, and we consider a family of simple +local update algorithms that yield non-trivial dynamics over the distribution +of agent strategies. We show that these dynamics are related to a new class of +high-dimensional Ehrenfest random walks, and we derive exact characterizations +of their stationary distributions, bounds on their mixing times, and prove +their convergence to approximate distributional equilibria. Our results +highlight trade-offs between the local state space of each agent, and the +convergence rate and approximation factor of the underlying dynamics. Our +approach opens the door towards the further characterization of equilibrium +computation for other classes of games and dynamics in the population setting. + +
+
+ comment: To appear in PODC 2024 +
+
+
+
+
+ + ♻ ☆ Ephemeral Rollups are All you Need + + +
+ In the realm of open and composable gaming, we envision platforms where users +actively expand, create, engage, and immerse themselves in a rich world of +entertainment. One promising avenue for achieving this vision is through fully +on-chain (FOC) games, where both game state and logic reside on the blockchain, +maximizing composability. However, we must grapple with inherent limitations +and trade-offs, particularly in terms of costs and scalability. This paper +proposes BOLT, a framework that leverages the Solana Virtual Machine (SVM) to +scale FOC games without state fragmentation or compromised trust assumptions. +The framework introduces a systematic approach for discovering, utilizing, and +publishing modular pieces of logic as components deeply rooted in the +Entity-Component-System (ECS) pattern. To enhance scalability and resource +optimization, we introduce the concept of Ephemeral Rollups (ERs) that overcome +the tradeoffs of L2s horizontal scaling. These dedicated runtimes can be +customized to provide higher operational speed, configurable ticking +mechanisms, provable sessions and gasless transactions without +composability-scalability tradeoffs. + +
+
+
+
+
+
+
+
+ + Programming and Languages 2 + +
+
+
+ + ☆ Proving Functional Program Equivalence via Directed Lemma Synthesis + + +
+ Proving equivalence between functional programs is a fundamental problem in +program verification, which often amounts to reasoning about algebraic data +types (ADTs) and compositions of structural recursions. Modern theorem provers +address this problem by applying structural induction, which is insufficient +for proving many equivalence theorems. In such cases, one has to invent a set +of lemmas, prove these lemmas by additional induction, and use these lemmas to +prove the original theorem. There is, however, a lack of systematic +understanding of what lemmas are needed for inductive proofs and how these +lemmas can be synthesized automatically. This paper presents directed lemma +synthesis, an effective approach to automating equivalence proofs by +discovering critical lemmas using program synthesis techniques. We first +identify two induction-friendly forms of propositions that give formal +guarantees to the progress of the proof. We then propose two tactics that +synthesize and apply lemmas, thereby transforming the proof goal into +induction-friendly forms. Both tactics reduce lemma synthesis to a specialized +class of program synthesis problems with efficient algorithms. Experimental +results demonstrate the effectiveness of our approach: Compared to +state-of-the-art equivalence checkers employing heuristic-based lemma +enumeration, directed lemma synthesis saves 95.47% runtime on average and +solves 38 more tasks over an extended version of the standard benchmark set. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Hal: A Language-General Framework for Analysis of User-Specified + Monotone Frameworks [DRAFT] + + +
+ Writing dataflow analyzers requires both language and domain-specificity. +That is to say, each programming language and each program property requires +its own analyzer. To enable a streamlined, user-driven approach to dataflow +analyzers, we introduce the theoretical framework for a user-specified dataflow +analysis. This framework is constructed in such a way that the user has to +specify as little as possible, while the analyzer infers and computes +everything else, including interprocedural embellishments. This theoretical +framework was also implemented in Java, where users can specify a program +property alongside minimal extra information to induce a dataflow analysis. +This framework (both theoretical and in implementation) is language-general, +meaning that it is independent of syntax and semantics (as all necessary +syntactic and semantic information is provided by the user, and this +information is provided only once for a given language). In this paper, we +introduce basic notions of intraprocedural and interprocedural dataflow +analyses, the proposed "Implicit Monotone Framework," and a rigorous framework +for partial functions as a property space. + +
+
+ comment: Undergraduate Senior Capstone Project +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 2 + +
+
+
+ + ☆ On the Expressivity of Recurrent Neural Cascades with Identity + + +
+ Recurrent Neural Cascades (RNC) are the class of recurrent neural networks +with no cyclic dependencies among recurrent neurons. Their subclass RNC+ with +positive recurrent weights has been shown to be closely connected to the +star-free regular languages, which are the expressivity of many +well-established temporal logics. The existing expressivity results show that +the regular languages captured by RNC+ are the star-free ones, and they leave +open the possibility that RNC+ may capture languages beyond regular. We exclude +this possibility for languages that include an identity element, i.e., an input +that can occur an arbitrary number of times without affecting the output. +Namely, in the presence of an identity element, we show that the languages +captured by RNC+ are exactly the star-free regular languages. Identity elements +are ubiquitous in temporal patterns, and hence our results apply to a large +number of applications. The implications of our results go beyond expressivity. +At their core, we establish a close structural correspondence between RNC+ and +semiautomata cascades, showing that every neuron can be equivalently captured +by a three-state semiautomaton. A notable consequence of this result is that +RNC+ are no more succinct than cascades of three-state semiautomata. + +
+
+
+
+
+ + ♻ ☆ Computing Minimal Absent Words and Extended Bispecial Factors with CDAWG + Space + + +
+ A string $w$ is said to be a minimal absent word (MAW) for a string $S$ if +$w$ does not occur in $S$ and any proper substring of $w$ occurs in $S$. We +focus on non-trivial MAWs which are of length at least 2. Finding such +non-trivial MAWs for a given string is motivated for applications in +bioinformatics and data compression. Fujishige et al. [TCS 2023] proposed a +data structure of size $\Theta(n)$ that can output the set $\mathsf{MAW}(S)$ of +all MAWs for a given string $S$ of length $n$ in $O(n + |\mathsf{MAW}(S)|)$ +time, based on the directed acyclic word graph (DAWG). In this paper, we +present a more space efficient data structure based on the compact DAWG +(CDAWG), which can output $\mathsf{MAW}(S)$ in $O(|\mathsf{MAW}(S)|)$ time with +$O(\mathsf{e}_\min)$ space, where $\mathsf{e}_\min$ denotes the minimum of the +sizes of the CDAWGs for $S$ and for its reversal $S^R$. For any strings of +length $n$, it holds that $\mathsf{e}_\min < 2n$, and for highly repetitive +strings $\mathsf{e}_\min$ can be sublinear (up to logarithmic) in $n$. We also +show that MAWs and their generalization minimal rare words have close +relationships with extended bispecial factors, via the CDAWG. + +
+
+ comment: Accepted for IWOCA 2024 +
+
+
+
+
+
+
+
+
+ Hardware Architecture 2
+
+
+
+
+ + ☆ OFHE: An Electro-Optical Accelerator for Discretized TFHE + + +
+ This paper presents \textit{OFHE}, an electro-optical accelerator designed to +process Discretized TFHE (DTFHE) operations, which encrypt multi-bit messages +and support homomorphic multiplications, lookup table operations and +full-domain functional bootstrappings. While DTFHE is more efficient and +versatile than other fully homomorphic encryption schemes, it requires 32-, +64-, and 128-bit polynomial multiplications, which can be time-consuming. +Existing TFHE accelerators are not easily upgradable to support DTFHE +operations due to limited datapaths, a lack of datapath bit-width +reconfigurability, and power inefficiencies when processing FFT and inverse FFT +(IFFT) kernels. Compared to prior TFHE accelerators, OFHE addresses these +challenges by improving the DTFHE operation latency by 8.7\%, the DTFHE +operation throughput by $57\%$, and the DTFHE operation throughput per Watt by +$94\%$. + +
+
+
+
+
+ + ☆ Enabling full-speed random access to the entire memory on the A100 GPU + + +
+ We describe some features of the A100 memory architecture. In particular, we +give a technique to reverse-engineer some hardware layout information. Using +this information, we show how to avoid TLB issues to obtain full-speed random +HBM access to the entire memory, as long as we constrain any particular thread +to a reduced access window of less than 64GB. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+
+
+
+ + Performance Profiling 1 + +
+
+
+ + ☆ Enabling full-speed random access to the entire memory on the A100 GPU + + +
+ We describe some features of the A100 memory architecture. In particular, we +give a technique to reverse-engineer some hardware layout information. Using +this information, we show how to avoid TLB issues to obtain full-speed random +HBM access to the entire memory, as long as we constrain any particular thread +to a reduced access window of less than 64GB. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+
+
+
+ + Computational Complexity 3 + +
+
+
+ + ☆ Fixed-parameter tractability of canonical polyadic decomposition over + finite fields + + +
+ We present a simple proof that finding a rank-$R$ canonical polyadic +decomposition of 3-dimensional tensors over a finite field $\mathbb{F}$ is +fixed-parameter tractable with respect to $R$ and $\mathbb{F}$. We also show +some more concrete upper bounds on the time complexity of this problem. + +
+
+ comment: 8 pages; some proofs copied from arXiv:2401.06857 +
+
+
+
+
+ + ♻ ☆ A Smoothed FPTAS for Equilibria in Congestion Games + + +
+ We present a fully polynomial-time approximation scheme (FPTAS) for computing +equilibria in congestion games, under smoothed running-time analysis. More +precisely, we prove that if the resource costs of a congestion game are +randomly perturbed by independent noises, whose density is at most $\phi$, then +any sequence of $(1+\varepsilon)$-improving dynamics will reach an +$(1+\varepsilon)$-approximate pure Nash equilibrium (PNE) after an expected +number of steps which is strongly polynomial in $\frac{1}{\varepsilon}$, +$\phi$, and the size of the game's description. Our results establish a sharp +contrast to the traditional worst-case analysis setting, where it is known that +better-response dynamics take exponentially long to converge to +$\alpha$-approximate PNE, for any constant factor $\alpha\geq 1$. As a matter +of fact, computing $\alpha$-approximate PNE in congestion games is PLS-hard. + We demonstrate how our analysis can be applied to various different models of +congestion games including general, step-function, and polynomial cost, as well +as fair cost-sharing games (where the resource costs are decreasing). It is +important to note that our bounds do not depend explicitly on the cardinality +of the players' strategy sets, and thus the smoothed FPTAS is readily +applicable to network congestion games as well. + +
+
+ comment: To appear at EC'24. Simplified analysis and improved bound in Lemma + 1. Improved bound at Eq. (11). These result in improved smoothed running time + bounds for all our congestion game models (i.e. Sections 3.2, 3.3.1, 3.3.2, + and 3.3.3) +
+
+
+
+
+ + ♻ ☆ On the Efficiency of An Election Game of Two or More Parties: How Bad + Can It Be? + + +
+ An election campaign among two or more parties can be viewed as a game of two +or more players, each of which has its own candidates as the pure strategies. +People, as voters, comprise supporters for each party, and a candidate brings +utility for the supporters of each party. Each party nominates exactly one of +its candidates to compete against the other party's. A candidate is assumed to +win the election with greater or equal odds if it brings more utility for all +the people. The payoff of each player is the expected utility that its +supporters get. The game is egoistic if every candidate benefits its party's +supporters more than any candidate from a competing party does. In this paper, +we first prove that it is NP-complete to determine whether an election game in +a succinct representation, which is called the general form, has a +pure-strategy Nash equilibrium even if it is egoistic. Next, we propose a +fixed-parameter tractable algorithm to compute a pure-strategy Nash equilibrium +of an egoistic election game and show that a naive constant time algorithm +leads to a (1+e)-approximate pure-strategy Nash equilibrium when the winning +probability is computed by a softmax function. Finally, perhaps surprisingly, +we show that the price of anarchy for egoistic election games is upper bounded +by the number of parties. Our results suggest that an election becomes +unpredictable in terms of stability and efficiency when more than two parties +are involved, and, to some extent, also provides supporting arguments for why +the two-party system is prevalent in democratic countries. + +
+
+ comment: A previous version appeared at the 6th Games, Agents, and Incentives + Workshop (GAIW-24). The current version has been submitted to SAGT 2024 +
+
+
+
+
+
+
+
+ + Logic in Computer Science 3 + +
+
+
+ + ☆ On the Expressivity of Recurrent Neural Cascades with Identity + + +
+ Recurrent Neural Cascades (RNC) are the class of recurrent neural networks +with no cyclic dependencies among recurrent neurons. Their subclass RNC+ with +positive recurrent weights has been shown to be closely connected to the +star-free regular languages, which are the expressivity of many +well-established temporal logics. The existing expressivity results show that +the regular languages captured by RNC+ are the star-free ones, and they leave +open the possibility that RNC+ may capture languages beyond regular. We exclude +this possibility for languages that include an identity element, i.e., an input +that can occur an arbitrary number of times without affecting the output. +Namely, in the presence of an identity element, we show that the languages +captured by RNC+ are exactly the star-free regular languages. Identity elements +are ubiquitous in temporal patterns, and hence our results apply to a large +number of applications. The implications of our results go beyond expressivity. +At their core, we establish a close structural correspondence between RNC+ and +semiautomata cascades, showing that every neuron can be equivalently captured +by a three-state semiautomaton. A notable consequence of this result is that +RNC+ are no more succinct than cascades of three-state semiautomata. + +
+
+
+
+
+ + ☆ Completeness of two fragments of a logic for conditional strategic + reasoning + + +
+ Classical logics for strategic reasoning, such as Coalition Logic and +Alternating-time Temporal Logic, formalize absolute strategic reasoning about +the unconditional strategic abilities of agents to achieve their goals. Goranko +and Ju introduced a logic ConStR for strategic reasoning about conditional +strategic abilities. However, its completeness is still an open problem. ConStR +has three featured operators, and one of them has the following reading: For +some action of A that guarantees the achievement of her goal, B has an action +to guarantee the achievement of his goal. The logic about this operator is +called CConStR. In this paper, we prove completeness for two fragments of +CConStR. The key notions of our proof approach include downward validity lemma, +grafted models, and upward derivability lemma. The proof approach has good +potential to be applied to the completeness of ConStR and other logics. + +
+
+
+
+
+ + ♻ ☆ Are Targeted Messages More Effective? + + +
+ Graph neural networks (GNN) are deep learning architectures for graphs. +Essentially, a GNN is a distributed message passing algorithm, which is +controlled by parameters learned from data. It operates on the vertices of a +graph: in each iteration, vertices receive a message on each incoming edge, +aggregate these messages, and then update their state based on their current +state and the aggregated messages. The expressivity of GNNs can be +characterised in terms of certain fragments of first-order logic with counting +and the Weisfeiler-Lehman algorithm. + The core GNN architecture comes in two different versions. In the first +version, a message only depends on the state of the source vertex, whereas in +the second version it depends on the states of the source and target vertices. +In practice, both of these versions are used, but the theory of GNNs so far +mostly focused on the first one. On the logical side, the two versions +correspond to two fragments of first-order logic with counting that we call +modal and guarded. + The question whether the two versions differ in their expressivity has been +mostly overlooked in the GNN literature and has only been asked recently +(Grohe, LICS'23). We answer this question here. It turns out that the answer is +not as straightforward as one might expect. By proving that the modal and +guarded fragment of first-order logic with counting have the same expressivity +over labelled undirected graphs, we show that in a non-uniform setting the two +GNN versions have the same expressivity. However, we also prove that in a +uniform setting the second version is strictly more expressive. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 10 + +
+
+
+ + ☆ Security of Cloud Services with Low-Performance Devices in Critical + Infrastructures + + +
+ As part of the Internet of Things (IoT) and Industry 4.0 Cloud services are +increasingly interacting with low-performance devices that are used in +automation. This results in security issues that will be presented in this +paper. Particular attention is paid to so-called critical infrastructures. The +authors intend to work on the addressed security challenges as part of a funded +research project, using electrical actuators and battery storages as specific +applications. The core ideas of this research project are also presented in +this paper. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Cloud Security and Security Challenges Revisited + + +
+ In recent years, Cloud Computing has transformed local businesses and created +new business models on the Internet- and Cloud services are still flourishing. +But after the emphatic hype in the early years, a more realistic perception of +Cloud services has emerged. One reason for this surely is that today, Cloud +Computing is considered as an established and well-accepted technology and no +longer as a technical novelty. But the second reason for this assessment might +also be numerous security issues that Cloud Computing in general or specific +Cloud services have experienced since then. In this paper, we revisit attacks +on Cloud services and Cloud-related attack vectors that have been published in +recent years. We then consider successful or proposed solutions to cope with +these challenges. Based on these findings, we apply a security metric in order +to rank all these Cloud-related security challenges concerning their severity. +This should assist security professionals to prioritize their efforts toward +addressing these issues. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ A Secure and Privacy-Friendly Logging Scheme + + +
+ Finding a robust security mechanism for audit trail logging has long been a +poorly satisfied goal. There are many reasons for this. The most significant of +these is that the audit trail is a highly sought after goal of attackers to +ensure that they do not get caught. Thus they have an incredibly strong +incentive to prevent companies from succeeding in this worthy aim. Regulation, +such as the European Union General Data Protection Regulation, has brought a +strong incentive for companies to achieve success in this area due to the +punitive level of fines that can now be levied in the event of a successful +breach by an attacker. We seek to resolve this issue through the use of an +encrypted audit trail process that saves encrypted records to a true immutable +database, which can ensure audit trail records are permanently retained in +encrypted form, with no possibility of the records being compromised. This +ensures compliance with the General Data Protection Regulation can be achieved. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Securing 3rd Party App Integration in Docker-based Cloud Software + Ecosystems + + +
+ Open software ecosystems are beneficial for customers; they benefit from 3rd +party services and applications, e.g. analysis of data using apps, developed +and deployed by other companies or open-source communities. One significant +advantage of this approach is that other customers may benefit from these newly +developed applications as well. Especially software ecosystems utilizing +container technologies are prone to certain risks. Docker, in particular, is +more vulnerable to attacks than hypervisor based virtualisation as it directly +operates on the host system. Docker is a popular representative of +containerisation technology which offers a lightweight architecture in order to +facilitate the set-up and creation of such software ecosystems. Popular +Infrastructure as a Service cloud service providers, like Amazon Web Services +or Microsoft Azure, jump on the containerisation bandwagon and provide +interfaces for provisioning and managing containers. Companies can benefit from +that change of technology and create software ecosystems more efficiently. In +this paper, we present a new concept for significant security improvements for +cloud-based software ecosystems using Docker for 3rd party app integration. +Based on the security features of Docker we describe a secure integration of +applications in the cloud environment securely. Our approach considers the +whole software lifecycle and includes sandbox testing of potentially dangerous +3rd party apps before these became available to the customers. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Cooperative Cognitive Dynamic System in UAV Swarms: Reconfigurable + Mechanism and Framework + + +
+ As the demands for immediate and effective responses increase in both +civilian and military domains, the unmanned aerial vehicle (UAV) swarms emerge +as effective solutions, in which multiple cooperative UAVs can work together to +achieve specific goals. However, how to manage such complex systems to ensure +real-time adaptability lack sufficient researches. Hence, in this paper, we +propose the cooperative cognitive dynamic system (CCDS), to optimize the +management for UAV swarms. CCDS leverages a hierarchical and cooperative +control structure that enables real-time data processing and decision. +Accordingly, CCDS optimizes the UAV swarm management via dynamic +reconfigurability and adaptive intelligent optimization. In addition, CCDS can +be integrated with the biomimetic mechanism to efficiently allocate tasks for +UAV swarms. Further, the distributed coordination of CCDS ensures reliable and +resilient control, thus enhancing the adaptability and robustness. Finally, the +potential challenges and future directions are analyzed, to provide insights +into managing UAV swarms in dynamic heterogeneous networking. + +
+
+
+
+
+ + ☆ MultiPaxos Made Complete + + +
+ MultiPaxos, while a fundamental Replicated State Machine algorithm, suffers +from a dearth of comprehensive guidelines for achieving a complete and correct +implementation. This deficiency has hindered MultiPaxos' practical utility and +adoption and has resulted in flawed claims about its capabilities. Our paper +aims to bridge the gap between MultiPaxos' complexity and practical +implementation through a meticulous and detailed design process spanning more +than a year. It carefully dissects each phase of MultiPaxos and offers detailed +step-by-step pseudocode -- in addition to a complete open-source implementation +-- for all components, including the leader election, the failure detector, and +the commit phase. + The implementation of our complete design also provides better performance +stability, resource usage, and network partition tolerance than naive +MultiPaxos versions. Our specification includes a lightweight log compaction +approach that avoids taking repeated snapshots, significantly improving +resource usage and performance stability. Our failure detector, integrated into +the commit phase of the algorithm, uses variable and adaptive heartbeat +intervals to settle on a better leader under partial connectivity and network +partitions, improving liveness under such conditions. + +
+
+
+
+
+ + ☆ The Cost of Garbage Collection for State Machine Replication + + +
+ State Machine Replication (SMR) protocols form the backbone of many +distributed systems. Enterprises and startups increasingly build their +distributed systems on the cloud due to its many advantages, such as +scalability and cost-effectiveness. One of the first technical questions +companies face when building a system on the cloud is which programming +language to use. Among many factors that go into this decision is whether to +use a language with garbage collection (GC), such as Java or Go, or a language +with manual memory management, such as C++ or Rust. Today, companies +predominantly prefer languages with GC, like Go, Kotlin, or even Python, due to +ease of development; however, there is no free lunch: GC costs resources +(memory and CPU) and performance (long tail latencies due to GC pauses). While +there have been anecdotal reports of reduced cloud cost and improved tail +latencies when switching from a language with GC to a language with manual +memory management, so far, there has not been a systematic study of the GC +overhead of running an SMR-based cloud system. + This paper studies the overhead of running an SMR-based cloud system written +in a language with GC. To this end, we design from scratch a canonical SMR +system -- a MultiPaxos-based replicated in-memory key-value store -- and we +implement it in C++, Java, Rust, and Go. We compare the performance and +resource usage of these implementations when running on the cloud under +different workloads and resource constraints and report our results. Our +findings have implications for the design of cloud systems. + +
+
+ comment: 17 pages, 14 figures +
+
+
+
+
+ + ☆ Towards Specialized Supercomputers for Climate Sciences: Computational + Requirements of the Icosahedral Nonhydrostatic Weather and Climate Model + + +
+ We discuss the computational challenges and requirements for high-resolution +climate simulations using the Icosahedral Nonhydrostatic Weather and Climate +Model (ICON). We define a detailed requirements model for ICON which emphasizes +the need for specialized supercomputers to accurately predict climate change +impacts and extreme weather events. Based on the requirements model, we outline +computational demands for km-scale simulations, and suggests machine learning +techniques to enhance model accuracy and efficiency. Our findings aim to guide +the design of future supercomputers for advanced climate science. + +
+
+
+
+
+ + ♻ ☆ Encrypted Container File: Design and Implementation of a + Hybrid-Encrypted Multi-Recipient File Structure + + +
+ Modern software engineering trends towards Cloud-native software development +by international teams of developers. Cloud-based version management services, +such as GitHub, are used for the source code and other artifacts created during +the development process. However, using such a service usually means that every +developer has access to all data stored on the platform. Particularly, if the +developers belong to different companies or organizations, it would be +desirable for sensitive files to be encrypted in such a way that these can only +be decrypted again by a group of previously defined people. In this paper, we +examine currently available tools that address this problem, but which have +certain shortcomings. We then present our own solution, Encrypted Container +Files (ECF), for this problem, eliminating the deficiencies found in the other +tools. + +
+
+ comment: 7 pages, for associated implementation etc., see + https://github.com/Hirnmoder/ECF +
+
+
+
+
+ + ♻ ☆ Accelerating Hybrid Federated Learning Convergence under Partial + Participation + + +
+ Over the past few years, Federated Learning (FL) has become a popular +distributed machine learning paradigm. FL involves a group of clients with +decentralized data who collaborate to learn a common model under the +coordination of a centralized server, with the goal of protecting clients' +privacy by ensuring that local datasets never leave the clients and that the +server only performs model aggregation. However, in realistic scenarios, the +server may be able to collect a small amount of data that approximately mimics +the population distribution and has stronger computational ability to perform +the learning process. To address this, we focus on the hybrid FL framework in +this paper. While previous hybrid FL work has shown that the alternative +training of clients and server can increase convergence speed, it has focused +on the scenario where clients fully participate and ignores the negative effect +of partial participation. In this paper, we provide theoretical analysis of +hybrid FL under clients' partial participation to validate that partial +participation is the key constraint on convergence speed. We then propose a new +algorithm called FedCLG, which investigates the two-fold role of the server in +hybrid FL. Firstly, the server needs to process the training steps using its +small amount of local datasets. Secondly, the server's calculated gradient +needs to guide the participated clients' training and the server's aggregation. +We validate our theoretical findings through numerical experiments, which show +that our proposed method FedCLG outperforms state-of-the-art methods. + +
+
+ comment: Accepted by IEEE Transactions on Signal Processing, Update the + convergence analysis and add more experiment results +
+
+
+
+
+
+
+
+ + Computational Complexity 4 + +
+
+
+ + ☆ Inner-approximate Reachability Computation via Zonotopic Boundary + Analysis + + +
+ Inner-approximate reachability analysis involves calculating subsets of +reachable sets, known as inner-approximations. This analysis is crucial in the +fields of dynamic systems analysis and control theory as it provides a reliable +estimation of the set of states that a system can reach from given initial +states at a specific time instant. In this paper, we study the +inner-approximate reachability analysis problem based on the set-boundary +reachability method for systems modelled by ordinary differential equations, in +which the computed inner-approximations are represented with zonotopes. The +set-boundary reachability method computes an inner-approximation by excluding +states reached from the initial set's boundary. The effectiveness of this +method is highly dependent on the efficient extraction of the exact boundary of +the initial set. To address this, we propose methods leveraging boundary and +tiling matrices that can efficiently extract and refine the exact boundary of +the initial set represented by zonotopes. Additionally, we enhance the +exclusion strategy by contracting the outer-approximations in a flexible way, +which allows for the computation of less conservative inner-approximations. To +evaluate the proposed method, we compare it with state-of-the-art methods +against a series of benchmarks. The numerical results demonstrate that our +method is not only efficient but also accurate in computing +inner-approximations. + +
+
+ comment: the full version of the paper accepted by CAV 2024 +
+
+
+
+
+ + ♻ ☆ P=NP + + +
+ This paper investigates an extremely classic NP-complete problem: How to +determine if a graph G, where each vertex has a degree of at most 4, can be +3-colorable(The research in this paper focuses on graphs G that satisfy the +condition where the degree of each vertex does not exceed 4. To conserve space, +it is assumed throughout the paper that graph G meets this condition by +default.). The author has meticulously observed the relationship between the +coloring problem and semidefinite programming, and has creatively constructed +the corresponding semidefinite programming problem R(G) for a given graph G. +The construction method of R(G) refers to Theorem 1.1 in the paper. I have +obtained and proven the conclusion: A graph G is 3-colorable if and only if the +objective function of its corresponding optimization problem R(G) is bounded, +and when the objective function is bounded, its minimum value is 0. + +
+
+
+
+
+ + ♻ ☆ On Probabilistic and Causal Reasoning with Summation Operators + + +
+ Ibeling et al. (2023) axiomatize increasingly expressive languages of +causation and probability, and Mosse et al. (2024) show that reasoning +(specifically the satisfiability problem) in each causal language is as +difficult, from a computational complexity perspective, as reasoning in its +merely probabilistic or "correlational" counterpart. Introducing a summation +operator to capture common devices that appear in applications -- such as the +$do$-calculus of Pearl (2009) for causal inference, which makes ample use of +marginalization -- van der Zander et al. (2023) partially extend these earlier +complexity results to causal and probabilistic languages with marginalization. +We complete this extension, fully characterizing the complexity of +probabilistic and causal reasoning with summation, demonstrating that these +again remain equally difficult. Surprisingly, allowing free variables for +random variable values results in a system that is undecidable, so long as the +ranges of these random variables are unrestricted. We finally axiomatize these +languages featuring marginalization (or more generally summation), resolving +open questions posed by Ibeling et al. (2023). + +
+
+
+
+
+ + ♻ ☆ On the complexity of symmetric vs. functional PCSPs + + +
+ The complexity of the promise constraint satisfaction problem +$\operatorname{PCSP}(\mathbf{A},\mathbf{B})$ is largely unknown, even for +symmetric $\mathbf{A}$ and $\mathbf{B}$, except for the case when $\mathbf{A}$ +and $\mathbf{B}$ are Boolean. + First, we establish a dichotomy for +$\operatorname{PCSP}(\mathbf{A},\mathbf{B})$ where $\mathbf{A}, \mathbf{B}$ are +symmetric, $\mathbf{B}$ is functional (i.e. any $r-1$ elements of an $r$-ary +tuple uniquely determines the last one), and $(\mathbf{A},\mathbf{B})$ +satisfies technical conditions we introduce called dependency and additivity. +This result implies a dichotomy for +$\operatorname{PCSP}(\mathbf{A},\mathbf{B})$ with $\mathbf{A},\mathbf{B}$ +symmetric and $\mathbf{B}$ functional if (i) $\mathbf{A}$ is Boolean, or (ii) +$\mathbf{A}$ is a hypergraph of a small uniformity, or (iii) $\mathbf{A}$ has a +relation $R^{\mathbf{A}}$ of arity at least 3 such that the hypergraph diameter +of $(A, R^{\mathbf{A}})$ is at most 1. + Second, we show that for $\operatorname{PCSP}(\mathbf{A},\mathbf{B})$, where +$\mathbf{A}$ and $\mathbf{B}$ contain a single relation, $\mathbf{A}$ satisfies +a technical condition called balancedness, and $\mathbf{B}$ is arbitrary, the +combined basic linear programming relaxation (BLP) and the affine integer +programming relaxation (AIP) is no more powerful than the (in general strictly +weaker) AIP relaxation. Balanced $\mathbf{A}$ include symmetric $\mathbf{A}$ +or, more generally, $\mathbf{A}$ preserved by a transitive permutation group. + +
+
+ comment: Full version (with stronger results) of a LICS'23 paper +
+
+
+
+
+
+
+
+ + Logic in Computer Science 6 + +
+
+
+ + ☆ SMT-based Symbolic Model-Checking for Operator Precedence Languages + + +
+ Operator Precedence Languages (OPL) have been recently identified as a +suitable formalism for model checking recursive procedural programs, thanks to +their ability of modeling the program stack. OPL requirements can be expressed +in the Precedence Oriented Temporal Logic (POTL), which features modalities to +reason on the natural matching between function calls and returns, exceptions, +and other advanced programming constructs that previous approaches, such as +Visibly Pushdown Languages, cannot model effectively. Existing approaches for +model checking of POTL have been designed following the explicit-state, +automata-based approach, a feature that severely limits their scalability. In +this paper, we give the first symbolic, SMT-based approach for model checking +POTL properties. While previous approaches construct the automaton for both the +POTL formula and the model of the program, we encode them into a (sequence of) +SMT formulas. The search of a trace of the model witnessing a violation of the +formula is then carried out by an SMT-solver, in a Bounded Model Checking +fashion. We carried out an experimental evaluation, which shows the +effectiveness of the proposed solution. + +
+
+ comment: 30 pages, 6 figures +
+
+
+
+
+ + ☆ Concurrent Games over Relational Structures: The Origin of Game Comonads + + +
+ Spoiler-Duplicator games are used in finite model theory to examine the +expressive power of logics. Their strategies have recently been reformulated as +coKleisli maps of game comonads over relational structures, providing new +results in finite model theory via categorical techniques. We present a novel +framework for studying Spoiler-Duplicator games by viewing them as event +structures. We introduce a first systematic method for constructing comonads +for all one-sided Spoiler-Duplicator games: game comonads are now realised by +adjunctions to a category of games, generically constructed from a comonad in a +bicategory of game schema (called signature games). Maps of the constructed +categories of games are strategies and generalise coKleisli maps of game +comonads; in the case of one-sided games they are shown to coincide with +suitably generalised homomorphisms. Finally, we provide characterisations of +strategies on two-sided Spoiler-Duplicator games; in a common special case they +coincide with spans of event structures. + +
+
+ comment: Extended version of the paper in Logic in Computer Science (LICS) + 2024 Proceedings +
+
+
+
+
+ + ☆ Propositional dynamic logic and asynchronous cascade decompositions for + regular trace languages + + +
+ We propose a local, past-oriented fragment of propositional dynamic logic to +reason about concurrent scenarios modelled as Mazurkiewicz traces, and prove it +to be expressively complete with respect to regular trace languages. Because of +locality, specifications in this logic are efficiently translated into +asynchronous automata, in a way that reflects the structure of formulas. In +particular, we obtain a new proof of Zielonka's fundamental theorem and we +prove that any regular trace language can be implemented by a cascade product +of localized asynchronous automata, which essentially operate on a single +process. + These results refine earlier results by Adsul et al. which involved a larger +fragment of past propositional dynamic logic and used Mukund and Sohoni's +gossip automaton. Our new results avoid using this automaton, or Zielonka's +timestamping mechanism and, in particular, they show how to implement a gossip +automaton as a cascade product. + +
+
+ comment: 13 pages. Accepted for publication at LICS 2024 +
+
+
+
+
+ + ♻ ☆ On Probabilistic and Causal Reasoning with Summation Operators + + +
+ Ibeling et al. (2023) axiomatize increasingly expressive languages of +causation and probability, and Mosse et al. (2024) show that reasoning +(specifically the satisfiability problem) in each causal language is as +difficult, from a computational complexity perspective, as reasoning in its +merely probabilistic or "correlational" counterpart. Introducing a summation +operator to capture common devices that appear in applications -- such as the +$do$-calculus of Pearl (2009) for causal inference, which makes ample use of +marginalization -- van der Zander et al. (2023) partially extend these earlier +complexity results to causal and probabilistic languages with marginalization. +We complete this extension, fully characterizing the complexity of +probabilistic and causal reasoning with summation, demonstrating that these +again remain equally difficult. Surprisingly, allowing free variables for +random variable values results in a system that is undecidable, so long as the +ranges of these random variables are unrestricted. We finally axiomatize these +languages featuring marginalization (or more generally summation), resolving +open questions posed by Ibeling et al. (2023). + +
+
+
+
+
+ + ♻ ☆ Algebraic Reasoning Meets Automata in Solving Linear Integer Arithmetic + (Technical Report) + + +
+ We present a new angle on solving quantified linear integer arithmetic based +on combining the automata-based approach, where numbers are understood as +bitvectors, with ideas from (nowadays prevalent) algebraic approaches, which +work directly with numbers. This combination is enabled by a fine-grained +version of the duality between automata and arithmetic formulae. In particular, +we employ a construction where states of automaton are obtained as derivatives +of arithmetic formulae: then every state corresponds to a formula. +Optimizations based on techniques and ideas transferred from the world of +algebraic methods are used on thousands of automata states, which dramatically +amplifies their effect. The merit of this combination of automata with +algebraic methods is demonstrated by our prototype implementation being +competitive to and even superior to state-of-the-art SMT solvers. + +
+
+ comment: Accepted to CAV'24 +
+
+
+
+
+ + ♻ ☆ A Proof-theoretic Semantics for Intuitionistic Linear Logic + + +
+ The approach taken by Gheorghiu, Gu and Pym in their paper on giving a +Base-extension Semantics for Intuitionistic Multiplicative Linear Logic is an +interesting adaptation of the work of Sandqvist for IPL to the substructural +setting. What is particularly interesting is how naturally the move to the +substructural setting provided a semantics for the multiplicative fragment of +intuitionistic linear logic. Whilst ultimately the Gheorghiu, Gu and Pym used +their foundations to provide a semantics for bunched implication logic, it begs +the question, what of the rest of intuitionistic linear logic? In this paper, I +present just such a semantics. This is particularly of interest as this logic +has as a connective the bang, a modal connective. Capturing the inferentialist +content of formulas marked with this connective is particularly challenging and +a discussion is dedicated to this at the end of the paper. + +
+
+ comment: 28 pages +
+
+
+
+
+
+
+
+ + Hardware Architecturea 2 + +
+
+
+ + ☆ NTTSuite: Number Theoretic Transform Benchmarks for Accelerating + Encrypted Computation + + +
+ Privacy concerns have thrust privacy-preserving computation into the +spotlight. Homomorphic encryption (HE) is a cryptographic system that enables +computation to occur directly on encrypted data, providing users with strong +privacy (and security) guarantees while using the same services they enjoy +today unprotected. While promising, HE has seen little adoption due to +extremely high computational overheads, rendering it impractical. Homomorphic +encryption (HE) is a cryptographic system that enables computation to occur +directly on encrypted data. In this paper we develop a benchmark suite, named +NTTSuite, to enable researchers to better address these overheads by studying +the primary source of HE's slowdown: the number theoretic transform (NTT). +NTTSuite constitutes seven unique NTT algorithms with support for CPUs (C++), +GPUs (CUDA), and custom hardware (Catapult HLS).In addition, we propose +optimizations to improve the performance of NTT running on FPGAs. We find our +implementation outperforms the state-of-the-art by 30%. + +
+
+ comment: 8 pages, 5 figures, and two tables. To download the source code, see + https://github.com/Dragon201701/NTTSuite +
+
+
+
+
+ + ☆ Towards Specialized Supercomputers for Climate Sciences: Computational + Requirements of the Icosahedral Nonhydrostatic Weather and Climate Model + + +
+ We discuss the computational challenges and requirements for high-resolution +climate simulations using the Icosahedral Nonhydrostatic Weather and Climate +Model (ICON). We define a detailed requirements model for ICON which emphasizes +the need for specialized supercomputers to accurately predict climate change +impacts and extreme weather events. Based on the requirements model, we outline +computational demands for km-scale simulations, and suggests machine learning +techniques to enhance model accuracy and efficiency. Our findings aim to guide +the design of future supercomputers for advanced climate science. + +
+
+
+
+
+
+
+
+ + Programming and Languages 4 + +
+
+
+ + ☆ An Opportunistically Parallel Lambda Calculus for Performant Composition + of Large Language Models + + +
+ Large language models (LLMs) have shown impressive results at a wide-range of +tasks. However, they have limitations, such as hallucinating facts and +struggling with arithmetic. Recent work has addressed these issues with +sophisticated decoding techniques. However, performant decoding, particularly +for sophisticated techniques, relies crucially on parallelization and batching, +which are difficult for developers. + We make two observations: 1) existing approaches are high-level +domain-specific languages for gluing expensive black-box calls, but are not +general or compositional; 2) LLM programs are essentially pure (all effects +commute). Guided by these observations, we develop a novel, general-purpose +lambda calculus for automatically parallelizing a wide-range of LLM +interactions, without user intervention. The key difference versus standard +lambda calculus is a novel "opportunistic" evaluation strategy, which steps +independent parts of a program in parallel, dispatching black-box external +calls as eagerly as possible, even while data-independent parts of the program +are waiting for their own external calls to return. To maintain the simplicity +of the language and to ensure uniformity of opportunistic evaluation, +control-flow and looping constructs are implemented in-language, via Church +encodings. + We implement this approach in a framework called EPIC, embedded in--and +interoperating closely with--Python. We demonstrate its versatility and +performance with three case studies drawn from the machine learning literature: +Tree-of-Thoughts (LLMs embedded in classic search procedures), nested tool use, +and constrained decoding. Our experiments show that opportunistic evaluation +offers a $1.5\times$ to $4.8\times$ speedup over sequential evaluation, while +still allowing practitioners to write straightforward and composable programs, +without any manual parallelism or batching. + +
+
+
+
+
+ + ☆ Concurrent Games over Relational Structures: The Origin of Game Comonads + + +
+ Spoiler-Duplicator games are used in finite model theory to examine the +expressive power of logics. Their strategies have recently been reformulated as +coKleisli maps of game comonads over relational structures, providing new +results in finite model theory via categorical techniques. We present a novel +framework for studying Spoiler-Duplicator games by viewing them as event +structures. We introduce a first systematic method for constructing comonads +for all one-sided Spoiler-Duplicator games: game comonads are now realised by +adjunctions to a category of games, generically constructed from a comonad in a +bicategory of game schema (called signature games). Maps of the constructed +categories of games are strategies and generalise coKleisli maps of game +comonads; in the case of one-sided games they are shown to coincide with +suitably generalised homomorphisms. Finally, we provide characterisations of +strategies on two-sided Spoiler-Duplicator games; in a common special case they +coincide with spans of event structures. + +
+
+ comment: Extended version of the paper in Logic in Computer Science (LICS) + 2024 Proceedings +
+
+
+
+
+ + ☆ Strided Difference Bound Matrices + + +
+ A wide range of symbolic analysis and optimization problems can be formalized +using polyhedra. Sub-classes of polyhedra, also known as sub-polyhedral +domains, are sought for their lower space and time complexity. We introduce the +Strided Difference Bound Matrix (SDBM) domain, which represents a sweet spot in +the context of optimizing compilers. Its expressiveness and efficient +algorithms are particularly well suited to the construction of machine learning +compilers. We present decision algorithms, abstract domain operators and +computational complexity proofs for SDBM. We also conduct an empirical study +with the MLIR compiler framework to validate the domain's practical +applicability. We characterize a sub-class of SDBMs that frequently occurs in +practice, and demonstrate even faster algorithms on this sub-class. + +
+
+ comment: Preprint and extended from the CAV 2024 conference version +
+
+
+
+
+ + ☆ Parsimonious Optimal Dynamic Partial Order Reduction + + +
+ Stateless model checking is a fully automatic verification technique for +concurrent programs that checks for safety violations by exploring all possible +thread schedulings. It becomes effective when coupled with Dynamic Partial +Order Reduction (DPOR), which introduces an equivalence on schedulings and +reduces the amount of needed exploration. DPOR algorithms that are +\emph{optimal} are particularly effective in that they guarantee to explore +\emph{exactly} one execution from each equivalence class. Unfortunately, +existing sequence-based optimal algorithms may in the worst case consume memory +that is exponential in the size of the analyzed program. In this paper, we +present Parsimonious-OPtimal (POP) DPOR, an optimal DPOR algorithm for +analyzing multi-threaded programs under sequential consistency, whose space +consumption is polynomial in the worst case. POP combines several novel +algorithmic techniques, including (i) a parsimonious race reversal strategy, +which avoids multiple reversals of the same race, (ii) an eager race reversal +strategy to avoid storing initial fragments of to-be-explored executions, and +(iii) a space-efficient scheme for preventing redundant exploration, which +replaces the use of sleep sets. Our implementation in Nidhugg shows that these +techniques can significantly speed up the analysis of concurrent programs, and +do so with low memory consumption. Comparison to a related optimal DPOR +algorithm for a different representation of concurrent executions as graphs +shows that POP has comparable worst-case performance for smaller benchmarks and +outperforms the other one for larger programs. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ☆ Propositional dynamic logic and asynchronous cascade decompositions for + regular trace languages + + +
+ We propose a local, past-oriented fragment of propositional dynamic logic to +reason about concurrent scenarios modelled as Mazurkiewicz traces, and prove it +to be expressively complete with respect to regular trace languages. Because of +locality, specifications in this logic are efficiently translated into +asynchronous automata, in a way that reflects the structure of formulas. In +particular, we obtain a new proof of Zielonka's fundamental theorem and we +prove that any regular trace language can be implemented by a cascade product +of localized asynchronous automata, which essentially operate on a single +process. + These results refine earlier results by Adsul et al. which involved a larger +fragment of past propositional dynamic logic and used Mukund and Sohoni's +gossip automaton. Our new results avoid using this automaton, or Zielonka's +timestamping mechanism and, in particular, they show how to implement a gossip +automaton as a cascade product. + +
+
+ comment: 13 pages. Accepted for publication at LICS 2024 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`