diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..d1e23527 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-09-12T00:00:00Z":{"Performance Profiling":[{"id":"http://arxiv.org/abs/2407.06894v2","updated":"2024-09-12T06:20:07Z","published":"2024-07-09T14:25:47Z","title":"RIS-Assisted Received Adaptive Spatial Modulation for Wireless\n Communication","summary":" A novel wireless transmission scheme, as named the reconfigurable intelligent\nsurface (RIS)-assisted received adaptive spatial modulation (RASM) scheme, is\nproposed in this paper. In this scheme, the adaptive spatial modulation\n(ASM)-based antennas selection works at the receiver by employing the\ncharacteristics of the RIS in each time slot, where the signal-to-noise ratio\nat specific selected antennas can be further enhanced with near few powers.\nBesides for the bits from constellation symbols, the extra bits can be mapped\ninto the indices of receive antenna combinations and conveyed to the receiver\nthrough the ASM-based antenna-combination selection, thus providing higher\nspectral efficiency. To explicitly present the RASM scheme, the analytical\nperformance of bit error rate of it is discussed in this paper. As a trade-off\nselection, the proposed scheme shows higher spectral efficiency and remains the\nsatisfactory error performance. Simulation and analytical results demonstrate\nthe better performance and exhibit more potential to apply in practical\nwireless communication.\n","authors":["Chaorong Zhang","Hui Xu","Benjamin K. 
Ng","Chan-Tong Lam","Ke Wang"],"pdf_url":"https://arxiv.org/pdf/2407.06894v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08115v1","updated":"2024-09-12T15:07:16Z","published":"2024-09-12T15:07:16Z","title":"Anonymized Network Sensing Graph Challenge","summary":" The MIT/IEEE/Amazon GraphChallenge encourages community approaches to\ndeveloping new solutions for analyzing graphs and sparse data derived from\nsocial media, sensor feeds, and scientific data to discover relationships\nbetween events as they unfold in the field. The anonymized network sensing\nGraph Challenge seeks to enable large, open, community-based approaches to\nprotecting networks. Many large-scale networking problems can only be solved\nwith community access to very broad data sets with the highest regard for\nprivacy and strong community buy-in. Such approaches often require\ncommunity-based data sharing. In the broader networking community (commercial,\nfederal, and academia) anonymized source-to-destination traffic matrices with\nstandard data sharing agreements have emerged as a data product that can meet\nmany of these requirements. This challenge provides an opportunity to highlight\nnovel approaches for optimizing the construction and analysis of anonymized\ntraffic matrices using over 100 billion network packets derived from the\nlargest Internet telescope in the world (CAIDA). This challenge specifies the\nanonymization, construction, and analysis of these traffic matrices. A\nGraphBLAS reference implementation is provided, but the use of GraphBLAS is not\nrequired in this Graph Challenge. As with prior Graph Challenges the goal is to\nprovide a well-defined context for demonstrating innovation. 
Graph Challenge\nparticipants are free to select (with accompanying explanation) the Graph\nChallenge elements that are appropriate for highlighting their innovations.\n","authors":["Hayden Jananthan","Michael Jones","William Arcand","David Bestor","William Bergeron","Daniel Burrill","Aydin Buluc","Chansup Byun","Timothy Davis","Vijay Gadepally","Daniel Grant","Michael Houle","Matthew Hubbell","Piotr Luszczek","Peter Michaleas","Lauren Milechin","Chasen Milner","Guillermo Morales","Andrew Morris","Julie Mullen","Ritesh Patel","Alex Pentland","Sandeep Pisharody","Andrew Prout","Albert Reuther","Antonio Rosa","Gabriel Wachman","Charles Yee","Jeremy Kepner"],"pdf_url":"https://arxiv.org/pdf/2409.08115v1.pdf","comment":"Accepted to IEEE HPEC 2024"},{"id":"http://arxiv.org/abs/2409.08108v1","updated":"2024-09-12T15:00:58Z","published":"2024-09-12T15:00:58Z","title":"Microarchitectural comparison and in-core modeling of state-of-the-art\n CPUs: Grace, Sapphire Rapids, and Genoa","summary":" With Nvidia's release of the Grace Superchip, all three big semiconductor\ncompanies in HPC (AMD, Intel, Nvidia) are currently competing in the race for\nthe best CPU. In this work we analyze the performance of these state-of-the-art\nCPUs and create an accurate in-core performance model for their\nmicroarchitectures Zen 4, Golden Cove, and Neoverse V2, extending the Open\nSource Architecture Code Analyzer (OSACA) tool and comparing it with LLVM-MCA.\nStarting from the peculiarities and up- and downsides of a single core, we\nextend our comparison by a variety of microbenchmarks and the capabilities of a\nfull node. 
The \"write-allocate (WA) evasion\" feature, which can automatically\nreduce the memory traffic caused by write misses, receives special attention;\nwe show that the Grace Superchip has a next-to-optimal implementation of WA\nevasion, and that the only way to avoid write allocates on Zen 4 is the\nexplicit use of non-temporal stores.\n","authors":["Jan Laukemann","Georg Hager","Gerhard Wellein"],"pdf_url":"https://arxiv.org/pdf/2409.08108v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.08075v1","updated":"2024-09-12T14:28:54Z","published":"2024-09-12T14:28:54Z","title":"Computational Algorithms for the Product Form Solution of Closed Queuing\n Networks with Finite Buffers and Skip-Over Policy","summary":" Closed queuing networks with finite capacity buffers and skip-over policies\nare fundamental models in the performance evaluation of computer and\ncommunication systems. This technical report presents the details of\ncomputational algorithms to derive the key performance metrics for such\nnetworks. The primary focus is on the efficient computation of the\nnormalization constant, which is critical for determining the steady-state\nprobabilities of the network states under investigation. A convolution\nalgorithm is proposed, which paves the way for the computation of key\nperformance indices, such as queue length distribution and throughput,\naccommodating the intricacies introduced by finite capacity constraints and\nskip-over mechanisms. Finally, an extension of the traditional Mean Value\nAnalysis algorithm addressing numerical stability is provided. 
The approaches\ndiscussed here allow make the investigation of large-scale networks feasible\nand enable the development of robust implementations of these techniques for\npractical use.\n","authors":["Gianfranco Balbo","Andrea Marin","Diletta Olliaro","Matteo Sereno"],"pdf_url":"https://arxiv.org/pdf/2409.08075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07950v1","updated":"2024-09-12T11:22:04Z","published":"2024-09-12T11:22:04Z","title":"Repr Types: One Abstraction to Rule Them All","summary":" The choice of how to represent an abstract type can have a major impact on\nthe performance of a program, yet mainstream compilers cannot perform\noptimizations at such a high level. When dealing with optimizations of data\ntype representations, an important feature is having extensible\nrepresentation-flexible data types; the ability for a programmer to add new\nabstract types and operations, as well as concrete implementations of these,\nwithout modifying the compiler or a previously defined library. Many research\nprojects support high-level optimizations through static analysis,\ninstrumentation, or benchmarking, but they are all restricted in at least one\naspect of extensibility.\n This paper presents a new approach to representation-flexible data types\nwithout such restrictions and which still finds efficient optimizations. Our\napproach centers around a single built-in type $\\texttt{repr}$ and function\noverloading with cost annotations for operation implementations. We evaluate\nour approach (i) by defining a universal collection type as a library, a single\ntype for all conventional collections, and (ii) by designing and implementing a\nrepresentation-flexible graph library. Programs using $\\texttt{repr}$ types are\ntypically faster than programs with idiomatic representation choices --\nsometimes dramatically so -- as long as the compiler finds good implementations\nfor all operations. 
Our compiler performs the analysis efficiently by finding\noptimized solutions quickly and by reusing previous results to avoid\nrecomputations.\n","authors":["Viktor Palmkvist","Anders Ågren Thuné","Elias Castegren","David Broman"],"pdf_url":"https://arxiv.org/pdf/2409.07950v1.pdf","comment":"25 pages, 11 figures"},{"id":"http://arxiv.org/abs/2409.08369v1","updated":"2024-09-12T19:30:22Z","published":"2024-09-12T19:30:22Z","title":"E-QUARTIC: Energy Efficient Edge Ensemble of Convolutional Neural\n Networks for Resource-Optimized Learning","summary":" Ensemble learning is a meta-learning approach that combines the predictions\nof multiple learners, demonstrating improved accuracy and robustness.\nNevertheless, ensembling models like Convolutional Neural Networks (CNNs)\nresult in high memory and computing overhead, preventing their deployment in\nembedded systems. These devices are usually equipped with small batteries that\nprovide power supply and might include energy-harvesting modules that extract\nenergy from the environment. In this work, we propose E-QUARTIC, a novel Energy\nEfficient Edge Ensembling framework to build ensembles of CNNs targeting\nArtificial Intelligence (AI)-based embedded systems. Our design outperforms\nsingle-instance CNN baselines and state-of-the-art edge AI solutions, improving\naccuracy and adapting to varying energy conditions while maintaining similar\nmemory requirements. Then, we leverage the multi-CNN structure of the designed\nensemble to implement an energy-aware model selection policy in\nenergy-harvesting AI systems. We show that our solution outperforms the\nstate-of-the-art by reducing system failure rate by up to 40% while ensuring\nhigher average output qualities. 
Ultimately, we show that the proposed design\nenables concurrent on-device training and high-quality inference execution at\nthe edge, limiting the performance and energy overheads to less than 0.04%.\n","authors":["Le Zhang","Onat Gungor","Flavio Ponzina","Tajana Rosing"],"pdf_url":"https://arxiv.org/pdf/2409.08369v1.pdf","comment":"Accepted by the 30th Asia and South Pacific Design Automation\n Conference (ASP-DAC 2025)"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2409.08229v1","updated":"2024-09-12T17:16:38Z","published":"2024-09-12T17:16:38Z","title":"Photonic Quantum Computers","summary":" In the pursuit of scalable and fault-tolerant quantum computing\narchitectures, photonic-based quantum computers have emerged as a leading\nfrontier. This article provides a comprehensive overview of advancements in\nphotonic quantum computing, developed by leading industry players, examining\ncurrent performance, architectural designs, and strategies for developing\nlarge-scale, fault-tolerant photonic quantum computers. It also highlights\nrecent groundbreaking experiments that leverage the unique advantages of\nphotonic technologies, underscoring their transformative potential. This review\ncaptures a pivotal moment of photonic quantum computing in the noisy\nintermediate-scale quantum (NISQ) era, offering insights into how photonic\nquantum computers might reshape the future of quantum computing.\n","authors":["M. 
AbuGhanem"],"pdf_url":"https://arxiv.org/pdf/2409.08229v1.pdf","comment":"47 pages, 16 figures"},{"id":"http://arxiv.org/abs/2409.08141v1","updated":"2024-09-12T15:34:23Z","published":"2024-09-12T15:34:23Z","title":"Rethinking Programmed I/O for Fast Devices, Cheap Cores, and Coherent\n Interconnects","summary":" Conventional wisdom holds that an efficient interface between an OS running\non a CPU and a high-bandwidth I/O device should be based on Direct Memory\nAccess (DMA), descriptor rings, and interrupts: DMA offloads transfers from the\nCPU, descriptor rings provide buffering and queuing, and interrupts facilitate\nasynchronous interaction between cores and device with a lightweight\nnotification mechanism. In this paper we question this wisdom in the light of\nmodern hardware and workloads, particularly in cloud servers. We argue that the\nassumptions that led to this model are obsolete, and in many use-cases use of\nprogrammed I/O, where the CPU explicitly transfers data and control information\nto and from a device via loads and stores, actually results in a more efficient\nsystem. We quantitatively demonstrate these advantages using three use-cases:\nfine-grained RPC-style invocation of functions on an accelerator, offloading of\noperators in a streaming dataflow engine, and a network interface targeting for\nserverless functions. Moreover, we show that while these advantages are\nsignificant over a modern PCIe peripheral bus, a truly cache-coherent\ninterconnect offers significant additional efficiency gains.\n","authors":["Anastasiia Ruzhanskaia","Pengcheng Xu","David Cock","Timothy Roscoe"],"pdf_url":"https://arxiv.org/pdf/2409.08141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07903v1","updated":"2024-09-12T10:14:29Z","published":"2024-09-12T10:14:29Z","title":"Dynamic Simultaneous Multithreaded Arch","summary":" This paper presents the Dynamic Simultaneous Multi-threaded Architecture\n(DSMT). 
DSMT efficiently exe-cutes multiple threads from a single program on a\nSMT processor core. To accomplish this, threads are generated dynamically from\na predictable flow of control and then executed speculatively. Data obtained\nduring the single context non-speculative execution phase of DSMT is used as a\nhint to speculate the posterior behavior of multiple threads. DSMT employs\nsimple mechanisms based on state bits that keep track of inter-thread\ndependencies in registers and memory, synchronize thread execution, and control\nrecovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for\nchoosing those sections of code which provide the highest performance based on\ntheir past execution history. The DSMT architecture was simulated with a new\ncycle-accurate, execution-driven simulator. Our simulation results show that\nDSMT has very good potential to improve SMT performance, even when only a\nsingle program is available. However, we found that dynamic thread behavior\ntogether with fre-quent misspeculation may also produce diminishing re-turns in\nperformance. Therefore, the challenge is to max-imize the amount of\nthread-level parallelism that DSMT is capable of exploiting and at the same\ntime reduce the fre-quency of misspeculations.\n","authors":["Daniel Ortiz-Arroyo","Ben Lee"],"pdf_url":"https://arxiv.org/pdf/2409.07903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07847v1","updated":"2024-09-12T08:47:44Z","published":"2024-09-12T08:47:44Z","title":"C3-VQA: Cryogenic Counter-based Co-processor for Variational Quantum\n Algorithms","summary":" Cryogenic quantum computers play a leading role in demonstrating quantum\nadvantage. 
Given the severe constraints on the cooling capacity in cryogenic\nenvironments, thermal design is crucial for the scalability of these computers.\nThe sources of heat dissipation include passive inflow via inter-temperature\nwires and the power consumption of components located in the cryostat, such as\nwire amplifiers and quantum-classical interfaces. Thus, a critical challenge is\nto reduce the number of wires by reducing the required inter-temperature\nbandwidth while maintaining minimal additional power consumption in the\ncryostat. One solution to address this challenge is near-data processing using\nultra-low-power computational logic within the cryostat. Based on the workload\nanalysis and domain-specific system design focused on Variational Quantum\nAlgorithms (VQAs), we propose the Cryogenic Counter-based Co-processor for VQAs\n(C3-VQA) to enhance the design scalability of cryogenic quantum computers under\nthe thermal constraint. The C3-VQA utilizes single-flux-quantum logic, which is\nan ultra-low-power superconducting digital circuit that operates at the 4 K\nenvironment. The C3-VQA precomputes a part of the expectation value\ncalculations for VQAs and buffers intermediate values using simple bit\noperation units and counters in the cryostat, thereby reducing the required\ninter-temperature bandwidth with small additional power consumption.\nConsequently, the C3-VQA reduces the number of wires, leading to a reduction in\nthe total heat dissipation in the cryostat. 
Our evaluation shows that the\nC3-VQA reduces the total heat dissipation at the 4 K stage by 30% and 81% under\nsequential-shot and parallel-shot execution scenarios, respectively.\nFurthermore, a case study in quantum chemistry shows that the C3-VQA reduces\ntotal heat dissipation by 87% with a 10,000-qubit system.\n","authors":["Yosuke Ueno","Satoshi Imamura","Yuna Tomida","Teruo Tanimoto","Masamitsu Tanaka","Yutaka Tabuchi","Koji Inoue","Hiroshi Nakamura"],"pdf_url":"https://arxiv.org/pdf/2409.07847v1.pdf","comment":"15 pages, 9 figures, 5 tables. This is an extention of\n arXiv:2403.00363 and arXiv:2310.01630"},{"id":"http://arxiv.org/abs/2409.07832v1","updated":"2024-09-12T08:29:37Z","published":"2024-09-12T08:29:37Z","title":"Efficient and Reliable Vector Similarity Search Using Asymmetric\n Encoding with NAND-Flash for Many-Class Few-Shot Learning","summary":" While memory-augmented neural networks (MANNs) offer an effective solution\nfor few-shot learning (FSL) by integrating deep neural networks with external\nmemory, the capacity requirements and energy overhead of data movement become\nenormous due to the large number of support vectors in many-class FSL\nscenarios. Various in-memory search solutions have emerged to improve the\nenergy efficiency of MANNs. NAND-based multi-bit content addressable memory\n(MCAM) is a promising option due to its high density and large capacity.\nDespite its potential, MCAM faces limitations such as a restricted number of\nword lines, limited quantization levels, and non-ideal effects like varying\nstring currents and bottleneck effects, which lead to significant accuracy\ndrops. To address these issues, we propose several innovative methods. First,\nthe Multi-bit Thermometer Code (MTMC) leverages the extensive capacity of MCAM\nto enhance vector precision using cumulative encoding rules, thereby mitigating\nthe bottleneck effect. 
Second, the Asymmetric vector similarity search (AVSS)\nreduces the precision of the query vector while maintaining that of the support\nvectors, thereby minimizing the search iterations and improving efficiency in\nmany-class scenarios. Finally, the Hardware-Aware Training (HAT) method\noptimizes controller training by modeling the hardware characteristics of MCAM,\nthus enhancing the reliability of the system. Our integrated framework reduces\nsearch iterations by up to 32 times, and increases overall accuracy by 1.58% to\n6.94%.\n","authors":["Hao-Wei Chiang","Chi-Tse Huang","Hsiang-Yun Cheng","Po-Hao Tseng","Ming-Hsiu Lee"," An-Yeu"," Wu"],"pdf_url":"https://arxiv.org/pdf/2409.07832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11424v1","updated":"2024-09-12T17:53:37Z","published":"2024-09-12T17:53:37Z","title":"LlamaF: An Efficient Llama2 Architecture Accelerator on Embedded FPGAs","summary":" Large language models (LLMs) have demonstrated remarkable abilities in\nnatural language processing. However, their deployment on resource-constrained\nembedded devices remains difficult due to memory and computational demands. In\nthis paper, we present an FPGA-based accelerator designed to improve LLM\ninference performance on embedded FPGAs. We employ post-training quantization\nto reduce model size and optimize for off-chip memory bandwidth. Our design\nfeatures asynchronous computation and a fully pipelined accelerator for\nmatrix-vector multiplication. 
Experiments of the TinyLlama 1.1B model on a\nXilinx ZCU102 platform show a 14.3-15.8x speedup and a 6.1x power efficiency\nimprovement over running exclusively on ZCU102 processing system (PS).\n","authors":["Han Xu","Yutong Li","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2409.11424v1.pdf","comment":null}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2409.08161v1","updated":"2024-09-12T15:54:40Z","published":"2024-09-12T15:54:40Z","title":"A Study on Asynchronous Vote-based Blockchains","summary":" Vote-based blockchains construct a state machine replication (SMR) system\namong participating nodes, using Byzantine Fault Tolerance (BFT) consensus\nprotocols to transition from one state to another. Currently, they rely on\neither synchronous or partially synchronous networks with leader-based\ncoordination or costly Asynchronous Common Subset (ACS) protocols in\nasynchronous settings, making them impractical for large-scale asynchronous\napplications.\n To make Asynchronous SMR scalable, this paper proposes a \\emph{validated\nstrong} BFT consensus model that allows leader-based coordination in\nasynchronous settings. Our BFT consensus model offers the same level of\ntolerance as binary byzantine agreement but does not demand consistency among\nhonest nodes before they vote. An SMR using our model allows nodes to operate\nin different, tentative, but mutually exclusive states until they eventually\nconverge on the same state. 
We propose an asynchronous BFT protocol for\nvote-based blockchains employing our consensus model to address several\ncritical challenges: how to ensure that nodes eventually converge on the same\nstate across voting rounds, how to assure that a blockchain will steadily\nprogress through epochs while reaching consensus for previous epochs, and how\nto maintain robust byzantine fault tolerance.\n Our protocol greatly reduces message complexity and is the first one to\nachieve linear view changes without relying on threshold signatures. We prove\nthat an asynchronous blockchain built on our protocol can operate with the\n\\emph{same} simplicity and efficiency as partially synchronous blockchains\nbuilt on, e.g. HotStuff-2. This facilitates deploying asynchronous blockchains\nacross large-scale networks.\n","authors":["Yibin Xu","Jianhua Shao","Tijs Slaats","Boris Düdder","Yongluan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.08161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08108v1","updated":"2024-09-12T15:00:58Z","published":"2024-09-12T15:00:58Z","title":"Microarchitectural comparison and in-core modeling of state-of-the-art\n CPUs: Grace, Sapphire Rapids, and Genoa","summary":" With Nvidia's release of the Grace Superchip, all three big semiconductor\ncompanies in HPC (AMD, Intel, Nvidia) are currently competing in the race for\nthe best CPU. In this work we analyze the performance of these state-of-the-art\nCPUs and create an accurate in-core performance model for their\nmicroarchitectures Zen 4, Golden Cove, and Neoverse V2, extending the Open\nSource Architecture Code Analyzer (OSACA) tool and comparing it with LLVM-MCA.\nStarting from the peculiarities and up- and downsides of a single core, we\nextend our comparison by a variety of microbenchmarks and the capabilities of a\nfull node. 
The \"write-allocate (WA) evasion\" feature, which can automatically\nreduce the memory traffic caused by write misses, receives special attention;\nwe show that the Grace Superchip has a next-to-optimal implementation of WA\nevasion, and that the only way to avoid write allocates on Zen 4 is the\nexplicit use of non-temporal stores.\n","authors":["Jan Laukemann","Georg Hager","Gerhard Wellein"],"pdf_url":"https://arxiv.org/pdf/2409.08108v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.09536v2","updated":"2024-09-12T13:53:31Z","published":"2024-04-15T07:59:11Z","title":"Noiseless Privacy-Preserving Decentralized Learning","summary":" Decentralized learning (DL) enables collaborative learning without a server\nand without training data leaving the users' devices. However, the models\nshared in DL can still be used to infer training data. Conventional defenses\nsuch as differential privacy and secure aggregation fall short in effectively\nsafeguarding user privacy in DL, either sacrificing model utility or\nefficiency. We introduce Shatter, a novel DL approach in which nodes create\nvirtual nodes (VNs) to disseminate chunks of their full model on their behalf.\nThis enhances privacy by (i) preventing attackers from collecting full models\nfrom other nodes, and (ii) hiding the identity of the original node that\nproduced a given model chunk. We theoretically prove the convergence of Shatter\nand provide a formal analysis demonstrating how Shatter reduces the efficacy of\nattacks compared to when exchanging full models between nodes. We evaluate the\nconvergence and attack resilience of Shatter with existing DL algorithms, with\nheterogeneous datasets, and against three standard privacy attacks. Our\nevaluation shows that Shatter not only renders these privacy attacks infeasible\nwhen each node operates 16 VNs but also exhibits a positive impact on model\nutility compared to standard DL. 
In summary, Shatter enhances the privacy of DL\nwhile maintaining the utility and efficiency of the model.\n","authors":["Sayan Biswas","Mathieu Even","Anne-Marie Kermarrec","Laurent Massoulie","Rafael Pires","Rishi Sharma","Martijn de Vos"],"pdf_url":"https://arxiv.org/pdf/2404.09536v2.pdf","comment":"Accepted at PETS 2025"},{"id":"http://arxiv.org/abs/2409.07903v1","updated":"2024-09-12T10:14:29Z","published":"2024-09-12T10:14:29Z","title":"Dynamic Simultaneous Multithreaded Arch","summary":" This paper presents the Dynamic Simultaneous Multi-threaded Architecture\n(DSMT). DSMT efficiently exe-cutes multiple threads from a single program on a\nSMT processor core. To accomplish this, threads are generated dynamically from\na predictable flow of control and then executed speculatively. Data obtained\nduring the single context non-speculative execution phase of DSMT is used as a\nhint to speculate the posterior behavior of multiple threads. DSMT employs\nsimple mechanisms based on state bits that keep track of inter-thread\ndependencies in registers and memory, synchronize thread execution, and control\nrecovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for\nchoosing those sections of code which provide the highest performance based on\ntheir past execution history. The DSMT architecture was simulated with a new\ncycle-accurate, execution-driven simulator. Our simulation results show that\nDSMT has very good potential to improve SMT performance, even when only a\nsingle program is available. However, we found that dynamic thread behavior\ntogether with fre-quent misspeculation may also produce diminishing re-turns in\nperformance. 
Therefore, the challenge is to max-imize the amount of\nthread-level parallelism that DSMT is capable of exploiting and at the same\ntime reduce the fre-quency of misspeculations.\n","authors":["Daniel Ortiz-Arroyo","Ben Lee"],"pdf_url":"https://arxiv.org/pdf/2409.07903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07734v1","updated":"2024-09-12T03:44:30Z","published":"2024-09-12T03:44:30Z","title":"DFDG: Data-Free Dual-Generator Adversarial Distillation for One-Shot\n Federated Learning","summary":" Federated Learning (FL) is a distributed machine learning scheme in which\nclients jointly participate in the collaborative training of a global model by\nsharing model information rather than their private datasets. In light of\nconcerns associated with communication and privacy, one-shot FL with a single\ncommunication round has emerged as a de facto promising solution. However,\nexisting one-shot FL methods either require public datasets, focus on model\nhomogeneous settings, or distill limited knowledge from local models, making it\ndifficult or even impractical to train a robust global model. To address these\nlimitations, we propose a new data-free dual-generator adversarial distillation\nmethod (namely DFDG) for one-shot FL, which can explore a broader local models'\ntraining space via training dual generators. DFDG is executed in an adversarial\nmanner and comprises two parts: dual-generator training and dual-model\ndistillation. In dual-generator training, we delve into each generator\nconcerning fidelity, transferability and diversity to ensure its utility, and\nadditionally tailor the cross-divergence loss to lessen the overlap of dual\ngenerators' output spaces. In dual-model distillation, the trained dual\ngenerators work together to provide the training data for updates of the global\nmodel. 
At last, our extensive experiments on various image classification tasks\nshow that DFDG achieves significant performance gains in accuracy compared to\nSOTA baselines.\n","authors":["Kangyang Luo","Shuai Wang","Yexuan Fu","Renrong Shao","Xiang Li","Yunshi Lan","Ming Gao","Jinlong Shu"],"pdf_url":"https://arxiv.org/pdf/2409.07734v1.pdf","comment":"Accepted by ICDM2024 main conference (long paper)"},{"id":"http://arxiv.org/abs/2409.07693v1","updated":"2024-09-12T01:55:08Z","published":"2024-09-12T01:55:08Z","title":"Cooperative Inference with Interleaved Operator Partitioning for CNNs","summary":" Deploying deep learning models on Internet of Things (IoT) devices often\nfaces challenges due to limited memory resources and computing capabilities.\nCooperative inference is an important method for addressing this issue,\nrequiring the partitioning and distributive deployment of an intelligent model.\nTo perform horizontal partitions, existing cooperative inference methods take\neither the output channel of operators or the height and width of feature maps\nas the partition dimensions. In this manner, since the activation of operators\nis distributed, they have to be concatenated together before being fed to the\nnext operator, which incurs the delay for cooperative inference. In this paper,\nwe propose the Interleaved Operator Partitioning (IOP) strategy for CNN models.\nBy partitioning an operator based on the output channel dimension and its\nsuccessive operator based on the input channel dimension, activation\nconcatenation becomes unnecessary, thereby reducing the number of communication\nconnections, which consequently reduces cooperative inference de-lay. Based on\nIOP, we further present a model segmentation algorithm for minimizing\ncooperative inference time, which greedily selects operators for IOP pairing\nbased on the inference delay benefit harvested. 
Experimental results\ndemonstrate that compared with the state-of-the-art partition approaches used\nin CoEdge, the IOP strategy achieves 6.39% ~ 16.83% faster acceleration and\nreduces peak memory footprint by 21.22% ~ 49.98% for three classical image\nclassification models.\n","authors":["Zhibang Liu","Chaonong Xu","Zhizhuo Liu","Lekai Huang","Jiachen Wei","Chao Li"],"pdf_url":"https://arxiv.org/pdf/2409.07693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08386v1","updated":"2024-09-12T20:32:07Z","published":"2024-09-12T20:32:07Z","title":"Self-Supervised Inference of Agents in Trustless Environments","summary":" In this paper, we propose a novel approach where agents can form swarms to\nproduce high-quality responses effectively. This is accomplished by utilizing\nagents capable of data inference and ranking, which can be effectively\nimplemented using LLMs as response classifiers. We assess existing approaches\nfor trustless agent inference, define our methodology, estimate practical\nparameters, and model various types of malicious agent attacks. Our method\nleverages the collective intelligence of swarms, ensuring robust and efficient\ndecentralized AI inference with better accuracy, security, and reliability. 
We\nshow that our approach is an order of magnitude faster than other trustless\ninference strategies reaching less than 125 ms validation latency.\n","authors":["Vladyslav Larin","Ivan Nikitin","Alexander Firsov"],"pdf_url":"https://arxiv.org/pdf/2409.08386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08369v1","updated":"2024-09-12T19:30:22Z","published":"2024-09-12T19:30:22Z","title":"E-QUARTIC: Energy Efficient Edge Ensemble of Convolutional Neural\n Networks for Resource-Optimized Learning","summary":" Ensemble learning is a meta-learning approach that combines the predictions\nof multiple learners, demonstrating improved accuracy and robustness.\nNevertheless, ensembling models like Convolutional Neural Networks (CNNs)\nresult in high memory and computing overhead, preventing their deployment in\nembedded systems. These devices are usually equipped with small batteries that\nprovide power supply and might include energy-harvesting modules that extract\nenergy from the environment. In this work, we propose E-QUARTIC, a novel Energy\nEfficient Edge Ensembling framework to build ensembles of CNNs targeting\nArtificial Intelligence (AI)-based embedded systems. Our design outperforms\nsingle-instance CNN baselines and state-of-the-art edge AI solutions, improving\naccuracy and adapting to varying energy conditions while maintaining similar\nmemory requirements. Then, we leverage the multi-CNN structure of the designed\nensemble to implement an energy-aware model selection policy in\nenergy-harvesting AI systems. We show that our solution outperforms the\nstate-of-the-art by reducing system failure rate by up to 40% while ensuring\nhigher average output qualities. 
Ultimately, we show that the proposed design\nenables concurrent on-device training and high-quality inference execution at\nthe edge, limiting the performance and energy overheads to less than 0.04%.\n","authors":["Le Zhang","Onat Gungor","Flavio Ponzina","Tajana Rosing"],"pdf_url":"https://arxiv.org/pdf/2409.08369v1.pdf","comment":"Accepted by the 30th Asia and South Pacific Design Automation\n Conference (ASP-DAC 2025)"},{"id":"http://arxiv.org/abs/2409.08308v1","updated":"2024-09-12T06:02:44Z","published":"2024-09-12T06:02:44Z","title":"DiReDi: Distillation and Reverse Distillation for AIoT Applications","summary":" Typically, the significant efficiency can be achieved by deploying different\nedge AI models in various real world scenarios while a few large models manage\nthose edge AI models remotely from cloud servers. However, customizing edge AI\nmodels for each user's specific application or extending current models to new\napplication scenarios remains a challenge. Inappropriate local training or fine\ntuning of edge AI models by users can lead to model malfunction, potentially\nresulting in legal issues for the manufacturer. To address aforementioned\nissues, this paper proposes an innovative framework called \"DiReD\", which\ninvolves knowledge DIstillation & REverse DIstillation. In the initial step, an\nedge AI model is trained with presumed data and a KD process using the cloud AI\nmodel in the upper management cloud server. This edge AI model is then\ndispatched to edge AI devices solely for inference in the user's application\nscenario. When the user needs to update the edge AI model to better fit the\nactual scenario, the reverse distillation (RD) process is employed to extract\nthe knowledge: the difference between user preferences and the manufacturer's\npresumptions from the edge AI model using the user's exclusive data. 
Only the\nextracted knowledge is reported back to the upper management cloud server to\nupdate the cloud AI model, thus protecting user privacy by not using any\nexclusive data. The updated cloud AI can then update the edge AI model with the\nextended knowledge. Simulation results demonstrate that the proposed \"DiReDi\"\nframework allows the manufacturer to update the user model by learning new\nknowledge from the user's actual scenario with private data. The initial\nredundant knowledge is reduced since the retraining emphasizes user private\ndata.\n","authors":["Chen Sun","Qing Tong","Wenshuang Yang","Wenqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08308v1.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2409.07950v1","updated":"2024-09-12T11:22:04Z","published":"2024-09-12T11:22:04Z","title":"Repr Types: One Abstraction to Rule Them All","summary":" The choice of how to represent an abstract type can have a major impact on\nthe performance of a program, yet mainstream compilers cannot perform\noptimizations at such a high level. When dealing with optimizations of data\ntype representations, an important feature is having extensible\nrepresentation-flexible data types; the ability for a programmer to add new\nabstract types and operations, as well as concrete implementations of these,\nwithout modifying the compiler or a previously defined library. Many research\nprojects support high-level optimizations through static analysis,\ninstrumentation, or benchmarking, but they are all restricted in at least one\naspect of extensibility.\n This paper presents a new approach to representation-flexible data types\nwithout such restrictions and which still finds efficient optimizations. Our\napproach centers around a single built-in type $\\texttt{repr}$ and function\noverloading with cost annotations for operation implementations. 
We evaluate\nour approach (i) by defining a universal collection type as a library, a single\ntype for all conventional collections, and (ii) by designing and implementing a\nrepresentation-flexible graph library. Programs using $\\texttt{repr}$ types are\ntypically faster than programs with idiomatic representation choices --\nsometimes dramatically so -- as long as the compiler finds good implementations\nfor all operations. Our compiler performs the analysis efficiently by finding\noptimized solutions quickly and by reusing previous results to avoid\nrecomputations.\n","authors":["Viktor Palmkvist","Anders Ågren Thuné","Elias Castegren","David Broman"],"pdf_url":"https://arxiv.org/pdf/2409.07950v1.pdf","comment":"25 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.17514v3","updated":"2024-09-12T06:11:50Z","published":"2024-05-27T08:31:12Z","title":"AbstractBeam: Enhancing Bottom-Up Program Synthesis using Library\n Learning","summary":" LambdaBeam is a state-of-the-art, execution-guided algorithm for program\nsynthesis that utilizes higher-order functions, lambda functions, and iterative\nloops within a Domain-Specific Language (DSL). LambdaBeam generates each\nprogram from scratch but does not take advantage of the frequent recurrence of\nprogram blocks or subprograms commonly found in specific domains, such as loops\nfor list traversal. To address this limitation, we introduce AbstractBeam: a\nnovel program synthesis framework designed to enhance LambdaBeam by leveraging\nLibrary Learning. AbstractBeam identifies and integrates recurring program\nstructures into the DSL, optimizing the synthesis process. Our experimental\nevaluations demonstrate that AbstractBeam statistically significantly (p <\n0.05) outperforms LambdaBeam in the integer list manipulation domain. 
Beyond\nsolving more tasks, AbstractBeam's program synthesis is also more efficient,\nrequiring less time and fewer candidate programs to generate a solution.\nFurthermore, our findings indicate that Library Learning effectively enhances\nprogram synthesis in domains that are not explicitly designed to showcase its\nadvantages, thereby highlighting the broader applicability of Library Learning.\n","authors":["Janis Zenkner","Lukas Dierkes","Tobias Sesterhenn","Chrisitan Bartelt"],"pdf_url":"https://arxiv.org/pdf/2405.17514v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09237v5","updated":"2024-09-12T18:03:54Z","published":"2024-08-17T16:06:14Z","title":"QEDCartographer: Automating Formal Verification Using Reward-Free\n Reinforcement Learning","summary":" Formal verification is a promising method for producing reliable software,\nbut the difficulty of manually writing verification proofs severely limits its\nutility in practice. Recent methods have automated some proof synthesis by\nguiding a search through the proof space using a theorem prover. Unfortunately,\nthe theorem prover provides only the crudest estimate of progress, resulting in\neffectively undirected search. To address this problem, we create\nQEDCartographer, an automated proof-synthesis tool that combines supervised and\nreinforcement learning to more effectively explore the proof space.\nQEDCartographer incorporates the proofs' branching structure, enabling\nreward-free search and overcoming the sparse reward problem inherent to formal\nverification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K\ntheorems from 124 open-source Coq projects. QEDCartographer fully automatically\nproves 21.4% of the test-set theorems. Previous search-based proof-synthesis\ntools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on\nsupervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively.\nDiva, which combines 62 tools, proves 19.2%. 
Comparing to the most effective\nprior tool, Proverbot9001, QEDCartographer produces 34% shorter proofs 29%\nfaster, on average over the theorems both tools prove. Together,\nQEDCartographer and non-learning-based CoqHammer prove 30.3% of the theorems,\nwhile CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement\nlearning is a fruitful research direction for improving proof-synthesis tools'\nsearch mechanisms.\n","authors":["Alex Sanchez-Stern","Abhishek Varghese","Zhanna Kaufman","Dylan Zhang","Talia Ringer","Yuriy Brun"],"pdf_url":"https://arxiv.org/pdf/2408.09237v5.pdf","comment":"Published in the International Conference on Software Engineering\n (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan\n Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal\n Verification Using Reward-Free Reinforcement Learning, in Proceedings of the\n 47th International Conference on Software Engineering (ICSE), 2025"},{"id":"http://arxiv.org/abs/2409.07870v1","updated":"2024-09-12T09:28:30Z","published":"2024-09-12T09:28:30Z","title":"Weaver: A Retargetable Compiler Framework for FPQA Quantum Architectures","summary":" While the prominent quantum computing architectures are based on\nsuperconducting technology, new quantum hardware technologies are emerging,\nsuch as Trapped Ions, Neutral Atoms (or FPQAs), Silicon Spin Qubits, etc. This\ndiverse set of technologies presents fundamental trade-offs in terms of\nscalability, performance, manufacturing, and operating expenses. To manage\nthese diverse quantum technologies, there is a growing need for a retargetable\ncompiler that can efficiently adapt existing code to these emerging hardware\nplatforms. 
Such a retargetable compiler must be extensible to support new and\nrapidly evolving technologies, performant with fast compilation times and\nhigh-fidelity execution, and verifiable through rigorous equivalence checking\nto ensure the functional equivalence of the retargeted code.\n To this end, we present $Weaver$, the first extensible, performant, and\nverifiable retargetable quantum compiler framework with a focus on FPQAs due to\ntheir unique, promising features. $Weaver$ introduces WQASM, the first formal\nextension of the standard OpenQASM quantum assembly with FPQA-specific\ninstructions to support their distinct capabilities. Next, $Weaver$ implements\nthe WOptimizer, an extensible set of FPQA-specific optimization passes to\nimprove execution quality. Last, the WChecker automatically checks for\nequivalence between the original and the retargeted code. Our evaluation shows\nthat $Weaver$ improves compilation times by $10^3\\times$, execution times by\n$4.4\\times$, and execution fidelity by $10\\%$, on average, compared to\nsuperconducting and state-of-the-art (non-retargetable) FPQA compilers.\n","authors":["Oğuzcan Kırmemiş","Francisco Romão","Emmanouil Giortamis","Pramod Bhatotia"],"pdf_url":"https://arxiv.org/pdf/2409.07870v1.pdf","comment":"11 pages, 12 figures"}],"Operation Systems":[{"id":"http://arxiv.org/abs/2409.08141v1","updated":"2024-09-12T15:34:23Z","published":"2024-09-12T15:34:23Z","title":"Rethinking Programmed I/O for Fast Devices, Cheap Cores, and Coherent\n Interconnects","summary":" Conventional wisdom holds that an efficient interface between an OS running\non a CPU and a high-bandwidth I/O device should be based on Direct Memory\nAccess (DMA), descriptor rings, and interrupts: DMA offloads transfers from the\nCPU, descriptor rings provide buffering and queuing, and interrupts facilitate\nasynchronous interaction between cores and device with a lightweight\nnotification mechanism. 
In this paper we question this wisdom in the light of\nmodern hardware and workloads, particularly in cloud servers. We argue that the\nassumptions that led to this model are obsolete, and in many use-cases use of\nprogrammed I/O, where the CPU explicitly transfers data and control information\nto and from a device via loads and stores, actually results in a more efficient\nsystem. We quantitatively demonstrate these advantages using three use-cases:\nfine-grained RPC-style invocation of functions on an accelerator, offloading of\noperators in a streaming dataflow engine, and a network interface targeting for\nserverless functions. Moreover, we show that while these advantages are\nsignificant over a modern PCIe peripheral bus, a truly cache-coherent\ninterconnect offers significant additional efficiency gains.\n","authors":["Anastasiia Ruzhanskaia","Pengcheng Xu","David Cock","Timothy Roscoe"],"pdf_url":"https://arxiv.org/pdf/2409.08141v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2409.08241v1","updated":"2024-09-12T17:41:19Z","published":"2024-09-12T17:41:19Z","title":"Communication Separations for Truthful Auctions: Breaking the Two-Player\n Barrier","summary":" We study the communication complexity of truthful combinatorial auctions, and\nin particular the case where valuations are either subadditive or\nsingle-minded, which we denote with $\\mathsf{SubAdd}\\cup\\mathsf{SingleM}$. We\nshow that for three bidders with valuations in\n$\\mathsf{SubAdd}\\cup\\mathsf{SingleM}$, any deterministic truthful mechanism\nthat achieves at least a $0.366$-approximation requires $\\exp(m)$\ncommunication. 
In contrast, a natural extension of [Fei09] yields a\nnon-truthful $\\mathrm{poly}(m)$-communication protocol that achieves a\n$\\frac{1}{2}$-approximation, demonstrating a gap between the power of truthful\nmechanisms and non-truthful protocols for this problem.\n Our approach follows the taxation complexity framework laid out in [Dob16b],\nbut applies this framework in a setting not encompassed by the techniques used\nin past work. In particular, the only successful prior application of this\nframework uses a reduction to simultaneous protocols which only applies for two\nbidders [AKSW20], whereas our three-player lower bounds are stronger than what\ncan possibly arise from a two-player construction (since a trivial truthful\nauction guarantees a $\\frac{1}{2}$-approximation for two players).\n","authors":["Shiri Ron","Clayton Thomas","S. Matthew Weinberg","Qianfan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08180v1","updated":"2024-09-12T16:10:32Z","published":"2024-09-12T16:10:32Z","title":"Fermionic Gaussian Testing and Non-Gaussian Measures via Convolution","summary":" We explore the properties of fermionic convolution defined by fermionic\nGaussian unitary. A key finding is the purity invariance of pure Gaussian\nstates under this convolution. Leveraging this property, we propose an\nefficient protocol to test the fermionic Gaussianity of pure states by using 3\ncopies of the input states. 
Furthermore, we introduce a new family of measures\ncalled ``Non-Gaussian Entropy,'' designed to quantify the fermionic\nnon-Gaussianity of states.\n","authors":["Nicholas Lyu","Kaifeng Bu"],"pdf_url":"https://arxiv.org/pdf/2409.08180v1.pdf","comment":"7+24 pages"},{"id":"http://arxiv.org/abs/2409.07996v1","updated":"2024-09-12T12:41:13Z","published":"2024-09-12T12:41:13Z","title":"A SUBSET-SUM Characterisation of the A-Hierarchy","summary":" The A-hierarchy is a parametric analogue of the polynomial hierarchy in the\ncontext of paramterised complexity theory. We give a new characterisation of\nthe A-hierarchy in terms of a generalisation of the SUBSET-SUM problem.\n","authors":["Jan Gutleben","Arne Meier"],"pdf_url":"https://arxiv.org/pdf/2409.07996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.15149v3","updated":"2024-09-12T11:39:17Z","published":"2022-09-30T00:25:04Z","title":"Pure-Circuit: Tight Inapproximability for PPAD","summary":" The current state-of-the-art methods for showing inapproximability in PPAD\narise from the $\\varepsilon$-Generalized-Circuit ($\\varepsilon$-GCircuit)\nproblem. Rubinstein (2018) showed that there exists a small unknown constant\n$\\varepsilon$ for which $\\varepsilon$-GCircuit is PPAD-hard, and subsequent\nwork has shown hardness results for other problems in PPAD by using\n$\\varepsilon$-GCircuit as an intermediate problem.\n We introduce Pure-Circuit, a new intermediate problem for PPAD, which can be\nthought of as $\\varepsilon$-GCircuit pushed to the limit as $\\varepsilon\n\\rightarrow 1$, and we show that the problem is PPAD-complete. We then prove\nthat $\\varepsilon$-GCircuit is PPAD-hard for all $\\varepsilon < 0.1$ by a\nreduction from Pure-Circuit, and thus strengthen all prior work that has used\nGCircuit as an intermediate problem from the existential-constant regime to the\nlarge-constant regime.\n We show that stronger inapproximability results can be derived by reducing\ndirectly from Pure-Circuit. 
In particular, we prove tight inapproximability\nresults for computing approximate Nash equilibria and approximate\nwell-supported Nash equilibria in graphical games, for finding approximate\nwell-supported Nash equilibria in polymatrix games, and for finding approximate\nequilibria in threshold games.\n","authors":["Argyrios Deligkas","John Fearnley","Alexandros Hollender","Themistoklis Melissourgos"],"pdf_url":"https://arxiv.org/pdf/2209.15149v3.pdf","comment":"This journal version combines the results of two prior conference\n papers: \"Pure-Circuit: Strong Inapproximability for PPAD\" published in FOCS\n 2022, and \"Tight Inapproximability for Graphical Games\" (arXiv:2209.15151)\n published in AAAI 2023"},{"id":"http://arxiv.org/abs/2409.08342v1","updated":"2024-09-12T18:12:40Z","published":"2024-09-12T18:12:40Z","title":"Undecidability and incompleteness in quantum information theory and\n operator algebras","summary":" We survey a number of incompleteness results in operator algebras stemming\nfrom the recent undecidability result in quantum complexity theory known as\n$\\operatorname{MIP}^*=\\operatorname{RE}$, the most prominent of which is the\nG\\\"odelian refutation of the Connes Embedding Problem. We also discuss the very\nrecent use of $\\operatorname{MIP}^*=\\operatorname{RE}$ in refuting the\nAldous-Lyons conjecture in probability theory.\n","authors":["Isaac Goldbring"],"pdf_url":"https://arxiv.org/pdf/2409.08342v1.pdf","comment":"38 pages. 
To appear in a special issue of Monatshefte f\\\"ur\n Mathematik celebrating the 100th anniversary of G\\\"odel's matriculation at\n the University of Vienna"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2409.08024v1","updated":"2024-09-12T13:07:16Z","published":"2024-09-12T13:07:16Z","title":"Alternating hierarchy of sushifts defined by nondeterministic\n plane-walking automata","summary":" Plane-walking automata were introduced by Salo & T\\\"orma to recognise\nlanguages of two-dimensional infinite words (subshifts), the counterpart of\n$4$-way finite automata for two-dimensional finite words. We extend the model\nto allow for nondeterminism and alternation of quantifiers. We prove that the\nrecognised subshifts form a strict subclass of sofic subshifts, and that the\nclasses corresponding to existential and universal nondeterminism are\nincomparable and both larger that the deterministic class. We define a\nhierarchy of subshifts recognised by plane-walking automata with alternating\nquantifiers, which we conjecture to be strict.\n","authors":["Benjamin Hellouin de Menibus","Pacôme Perrotin"],"pdf_url":"https://arxiv.org/pdf/2409.08024v1.pdf","comment":"14 pages, submitted to STACS 2025"},{"id":"http://arxiv.org/abs/2409.07882v1","updated":"2024-09-12T09:42:53Z","published":"2024-09-12T09:42:53Z","title":"$\\mathbb{N}$-polyregular functions arise from well-quasi-orderings","summary":" A fundamental construction in formal language theory is the Myhill-Nerode\ncongruence on words, whose finitedness characterizes regular language. This\nconstruction was generalized to functions from $\\Sigma^*$ to $\\mathbb{Z}$ by\nColcombet, Dou\\'eneau-Tabot, and Lopez to characterize the class of so-called\n$\\mathbb{Z}$-polyregular functions. 
In this paper, we relax the notion of\nequivalence relation to quasi-ordering in order to study the class of\n$\\mathbb{N}$-polyregular functions, that plays the role of\n$\\mathbb{Z}$-polyregular functions among functions from $\\Sigma^*$ to\n$\\mathbb{N}$. The analogue of having a finite index is then being a\nwell-quasi-ordering. This provides a canonical object to describe\n$\\mathbb{N}$-polyregular functions, together with a powerful new\ncharacterization of this class.\n","authors":["Aliaume Lopez"],"pdf_url":"https://arxiv.org/pdf/2409.07882v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.02232"},{"id":"http://arxiv.org/abs/2402.12040v2","updated":"2024-09-12T07:25:03Z","published":"2024-02-19T10:55:49Z","title":"Attack Tree Generation via Process Mining","summary":" Attack Trees are a graphical model of security used to study threat\nscenarios. While visually appealing and supported by solid theories and\neffective tools, one of their main drawbacks remains the amount of effort\nrequired by security experts to design them from scratch. This work aims to\nremedy this by providing a method for the automatic generation of Attack Trees\nfrom attack logs. The main original feature of our approach w.r.t existing ones\nis the use of Process Mining algorithms to synthesize Attack Trees, which allow\nusers to customize the way a set of logs are summarized as an Attack Tree, for\nexample by discarding statistically irrelevant events. Our approach is\nsupported by a prototype that, apart from the derivation and translation of the\nmodel, provides the user with an Attack Tree in the RisQFLan format, a tool\nused for quantitative risk modeling and analysis with Attack Trees. 
We\nillustrate our approach with the case study of attacks on a communication\nprotocol, produced by a state-of-the-art protocol analyzer.\n","authors":["Alyzia-Maria Konsta","Gemma Di Federico","Alberto Lluch Lafuente","Andrea Burattin"],"pdf_url":"https://arxiv.org/pdf/2402.12040v2.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2409.08243v1","updated":"2024-09-12T17:43:27Z","published":"2024-09-12T17:43:27Z","title":"Reasoning Around Paradox with Grounded Deduction","summary":" How can we reason around logical paradoxes without falling into them? This\npaper introduces grounded deduction or GD, a Kripke-inspired approach to\nfirst-order logic and arithmetic that is neither classical nor intuitionistic,\nbut nevertheless appears both pragmatically usable and intuitively justifiable.\nGD permits the direct expression of unrestricted recursive definitions -\nincluding paradoxical ones such as 'L := not L' - while adding dynamic typing\npremises to certain inference rules so that such paradoxes do not lead to\ninconsistency. This paper constitutes a preliminary development and\ninvestigation of grounded deduction, to be extended with further elaboration\nand deeper analysis of its intriguing properties.\n","authors":["Bryan Ford"],"pdf_url":"https://arxiv.org/pdf/2409.08243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01884v3","updated":"2024-09-12T16:55:17Z","published":"2024-01-03T18:53:53Z","title":"A rewriting-logic-with-SMT-based formal analysis and parameter synthesis\n framework for parametric time Petri nets","summary":" This paper presents a concrete and a symbolic rewriting logic semantics for\nparametric time Petri nets with inhibitor arcs (PITPNs), a flexible model of\ntimed systems where parameters are allowed in firing bounds. We prove that our\nsemantics is bisimilar to the \"standard\" semantics of PITPNs. 
This allows us to\nuse the rewriting logic tool Maude, combined with SMT solving, to provide sound\nand complete formal analyses for PITPNs. We develop and implement a new general\nfolding approach for symbolic reachability, so that Maude-with-SMT reachability\nanalysis terminates whenever the parametric state-class graph of the PITPN is\nfinite. Our work opens up the possibility of using the many formal analysis\ncapabilities of Maude -- including full LTL model checking, analysis with\nuser-defined analysis strategies, and even statistical model checking -- for\nsuch nets. We illustrate this by explaining how almost all formal analysis and\nparameter synthesis methods supported by the state-of-the-art PITPN tool Romeo\ncan be performed using Maude with SMT. In addition, we also support analysis\nand parameter synthesis from parametric initial markings, as well as full LTL\nmodel checking and analysis with user-defined execution strategies. Experiments\nshow that our methods outperform Romeo in many cases.\n","authors":["Jaime Arias","Kyungmin Bae","Carlos Olarte","Peter Csaba Ölveczky","Laure Petrucci"],"pdf_url":"https://arxiv.org/pdf/2401.01884v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.08929"},{"id":"http://arxiv.org/abs/2409.08119v1","updated":"2024-09-12T15:09:07Z","published":"2024-09-12T15:09:07Z","title":"Duality theory in linear optimization and its extensions -- formally\n verified","summary":" Farkas established that a system of linear inequalities has a solution if and\nonly if we cannot obtain a contradiction by taking a linear combination of the\ninequalities. We state and formally prove several Farkas-like theorems over\nlinearly ordered fields in Lean 4. 
Furthermore, we extend duality theory to the\ncase when some coefficients are allowed to take ``infinite values''.\n","authors":["Martin Dvorak","Vladimir Kolmogorov"],"pdf_url":"https://arxiv.org/pdf/2409.08119v1.pdf","comment":"Code: https://github.com/madvorak/duality/tree/v2.0.0"},{"id":"http://arxiv.org/abs/2409.07996v1","updated":"2024-09-12T12:41:13Z","published":"2024-09-12T12:41:13Z","title":"A SUBSET-SUM Characterisation of the A-Hierarchy","summary":" The A-hierarchy is a parametric analogue of the polynomial hierarchy in the\ncontext of paramterised complexity theory. We give a new characterisation of\nthe A-hierarchy in terms of a generalisation of the SUBSET-SUM problem.\n","authors":["Jan Gutleben","Arne Meier"],"pdf_url":"https://arxiv.org/pdf/2409.07996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.05344v4","updated":"2024-09-12T09:13:11Z","published":"2022-10-11T11:10:40Z","title":"From Proof-theoretic Validity to Base-extension Semantics for\n Intuitionistic Propositional Logic","summary":" Proof-theoretic semantics (P-tS) is the approach to meaning in logic based on\n'proof' (as opposed to 'truth'). There are two major approaches to P-tS:\nproof-theoretic validity (P-tV) and base-extension semantics (B-eS). The former\nis a semantics of arguments, and the latter is a semantics of logical\nconstants. This paper demonstrates that the B-eS for intuitionistic\npropositional logic (IPL) encapsulates the declarative content of a version of\nP-tV based on the elimination rules. This explicates how the B-eS for IPL\nworks, and shows the completeness of this version of P-tV.\n","authors":["Alexander V. Gheorghiu","David J. 
Pym"],"pdf_url":"https://arxiv.org/pdf/2210.05344v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07741v1","updated":"2024-09-12T04:16:22Z","published":"2024-09-12T04:16:22Z","title":"Handling expression evaluation under interference","summary":" Hoare-style inference rules for program constructs permit the copying of\nexpressions and tests from program text into logical contexts. It is known that\nthis requires care even for sequential programs but further issues arise for\nconcurrent programs because of potential interference to the values of\nvariables. The \"rely-guarantee\" approach does tackle the issue of recording\nacceptable interference and offers a way to provide safe inference rules. This\npaper shows how the algebraic presentation of rely-guarantee ideas can clarify\nand formalise the conditions for safely re-using expressions and tests from\nprogram text in logical contexts for reasoning about programs.\n","authors":["Ian J. Hayes","Cliff B. Jones","Larissa A. Meinicke"],"pdf_url":"https://arxiv.org/pdf/2409.07741v1.pdf","comment":"17 pages, 1 figure"}]},"2024-09-13T00:00:00Z":{"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2409.07903v2","updated":"2024-09-13T08:44:52Z","published":"2024-09-12T10:14:29Z","title":"Dynamic Simultaneous Multithreaded Architecture","summary":" This paper presents the Dynamic Simultaneous Multi-threaded Architecture\n(DSMT). DSMT efficiently exe-cutes multiple threads from a single program on a\nSMT processor core. To accomplish this, threads are generated dynamically from\na predictable flow of control and then executed speculatively. Data obtained\nduring the single context non-speculative execution phase of DSMT is used as a\nhint to speculate the posterior behavior of multiple threads. DSMT employs\nsimple mechanisms based on state bits that keep track of inter-thread\ndependencies in registers and memory, synchronize thread execution, and control\nrecovery from misspeculation. 
Moreover, DSMT utilizes a novel greedy policy for\nchoosing those sections of code which provide the highest performance based on\ntheir past execution history. The DSMT architecture was simulated with a new\ncycle-accurate, execution-driven simulator. Our simulation results show that\nDSMT has very good potential to improve SMT performance, even when only a\nsingle program is available. However, we found that dynamic thread behavior\ntogether with fre-quent misspeculation may also produce diminishing re-turns in\nperformance. Therefore, the challenge is to max-imize the amount of\nthread-level parallelism that DSMT is capable of exploiting and at the same\ntime reduce the fre-quency of misspeculations.\n","authors":["Daniel Ortiz-Arroyo","Ben Lee"],"pdf_url":"https://arxiv.org/pdf/2409.07903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04104v3","updated":"2024-09-13T02:48:33Z","published":"2024-08-07T21:45:01Z","title":"Hardware-Assisted Virtualization of Neural Processing Units for Cloud\n Platforms","summary":" Cloud platforms today have been deploying hardware accelerators like neural\nprocessing units (NPUs) for powering machine learning (ML) inference services.\nTo maximize the resource utilization while ensuring reasonable quality of\nservice, a natural approach is to virtualize NPUs for efficient resource\nsharing for multi-tenant ML services. However, virtualizing NPUs for modern\ncloud platforms is not easy. This is not only due to the lack of system\nabstraction support for NPU hardware, but also due to the lack of architectural\nand ISA support for enabling fine-grained dynamic operator scheduling for\nvirtualized NPUs.\n We present Neu10, a holistic NPU virtualization framework. We investigate\nvirtualization techniques for NPUs across the entire software and hardware\nstack. 
Neu10 consists of (1) a flexible NPU abstraction called vNPU, which\nenables fine-grained virtualization of the heterogeneous compute units in a\nphysical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go\ncomputing model and flexible vNPU-to-pNPU mappings for improved resource\nutilization and cost-effectiveness; (3) an ISA extension of modern NPU\narchitecture for facilitating fine-grained tensor operator scheduling for\nmultiple vNPUs. We implement Neu10 based on a production-level NPU simulator.\nOur experiments show that Neu10 improves the throughput of ML inference\nservices by up to 1.4$\\times$ and reduces the tail latency by up to\n4.6$\\times$, while improving the NPU utilization by 1.2$\\times$ on average,\ncompared to state-of-the-art NPU sharing approaches.\n","authors":["Yuqi Xue","Yiqi Liu","Lifeng Nai","Jian Huang"],"pdf_url":"https://arxiv.org/pdf/2408.04104v3.pdf","comment":"Accepted to MICRO'24"},{"id":"http://arxiv.org/abs/2409.08949v1","updated":"2024-09-13T16:08:08Z","published":"2024-09-13T16:08:08Z","title":"Generic and ML Workloads in an HPC Datacenter: Node Energy, Job\n Failures, and Node-Job Analysis","summary":" HPC datacenters offer a backbone to the modern digital society. Increasingly,\nthey run Machine Learning (ML) jobs next to generic, compute-intensive\nworkloads, supporting science, business, and other decision-making processes.\nHowever, understanding how ML jobs impact the operation of HPC datacenters,\nrelative to generic jobs, remains desirable but understudied. In this work, we\nleverage long-term operational data, collected from a national-scale production\nHPC datacenter, and statistically compare how ML and generic jobs can impact\nthe performance, failures, resource utilization, and energy consumption of HPC\ndatacenters. 
Our study provides key insights, e.g., ML-related power usage\ncauses GPU nodes to run into temperature limitations, median/mean runtime and\nfailure rates are higher for ML jobs than for generic jobs, both ML and generic\njobs exhibit highly variable arrival processes and resource demands,\nsignificant amounts of energy are spent on unsuccessfully terminating jobs, and\nconcurrent jobs tend to terminate in the same state. We open-source our\ncleaned-up data traces on Zenodo (https://doi.org/10.5281/zenodo.13685426), and\nprovide our analysis toolkit as software hosted on GitHub\n(https://github.com/atlarge-research/2024-icpads-hpc-workload-characterization).\nThis study offers multiple benefits for data center administrators, who can\nimprove operational efficiency, and for researchers, who can further improve\nsystem designs, scheduling techniques, etc.\n","authors":["Xiaoyu Chu","Daniel Hofstätter","Shashikant Ilager","Sacheendra Talluri","Duncan Kampert","Damian Podareanu","Dmitry Duplyakin","Ivona Brandic","Alexandru Iosup"],"pdf_url":"https://arxiv.org/pdf/2409.08949v1.pdf","comment":"10 pages, 10 figures, 6 tables, ICPADS 2024"},{"id":"http://arxiv.org/abs/2409.08595v1","updated":"2024-09-13T07:27:55Z","published":"2024-09-13T07:27:55Z","title":"Automatic Generation of Fast and Accurate Performance Models for Deep\n Neural Network Accelerators","summary":" Implementing Deep Neural Networks (DNNs) on resource-constrained edge devices\nis a challenging task that requires tailored hardware accelerator architectures\nand a clear understanding of their performance characteristics when executing\nthe intended AI workload. To facilitate this, we present an automated\ngeneration approach for fast performance models to accurately estimate the\nlatency of a DNN mapped onto systematically modeled and concisely described\naccelerator architectures. 
Using our accelerator architecture description\nmethod, we modeled representative DNN accelerators such as Gemmini, UltraTrail,\nPlasticine-derived, and a parameterizable systolic array. Together with DNN\nmappings for those modeled architectures, we perform a combined DNN/hardware\ndependency graph analysis, which enables us, in the best case, to evaluate only\n154 loop kernel iterations to estimate the performance for 4.19 billion\ninstructions achieving a significant speedup. We outperform regression and\nanalytical models in terms of mean absolute percentage error (MAPE) compared to\nsimulation results, while being several magnitudes faster than an RTL\nsimulation.\n","authors":["Konstantin Lübeck","Alexander Louis-Ferdinand Jung","Felix Wedlich","Mika Markus Müller","Federico Nicolás Peccia","Felix Thömmes","Jannik Steinmetz","Valentin Biermaier","Adrian Frischknecht","Paul Palomero Bernardo","Oliver Bringmann"],"pdf_url":"https://arxiv.org/pdf/2409.08595v1.pdf","comment":"Accepted version for: ACM Transactions on Embedded Computing Systems"},{"id":"http://arxiv.org/abs/2409.08534v1","updated":"2024-09-13T04:46:38Z","published":"2024-09-13T04:46:38Z","title":"AnalogGym: An Open and Practical Testing Suite for Analog Circuit\n Synthesis","summary":" Recent advances in machine learning (ML) for automating analog circuit\nsynthesis have been significant, yet challenges remain. A critical gap is the\nlack of a standardized evaluation framework, compounded by various process\ndesign kits (PDKs), simulation tools, and a limited variety of circuit\ntopologies. These factors hinder direct comparisons and the validation of\nalgorithms. To address these shortcomings, we introduced AnalogGym, an\nopen-source testing suite designed to provide fair and comprehensive\nevaluations. AnalogGym includes 30 circuit topologies in five categories:\nsensing front ends, voltage references, low dropout regulators, amplifiers, and\nphase-locked loops. 
It supports several technology nodes for academic and\ncommercial applications and is compatible with commercial simulators such as\nCadence Spectre, Synopsys HSPICE, and the open-source simulator Ngspice.\nAnalogGym standardizes the assessment of ML algorithms in analog circuit\nsynthesis and promotes reproducibility with its open datasets and detailed\nbenchmark specifications. AnalogGym's user-friendly design allows researchers\nto easily adapt it for robust, transparent comparisons of state-of-the-art\nmethods, while also exposing them to real-world industrial design challenges,\nenhancing the practical relevance of their work. Additionally, we have\nconducted a comprehensive comparison study of various analog sizing methods on\nAnalogGym, highlighting the capabilities and advantages of different\napproaches. AnalogGym is available in the GitHub repository\nhttps://github.com/CODA-Team/AnalogGym. The documentation is also available at\nhttp://coda-team.github.io/AnalogGym/.\n","authors":["Jintao Li","Haochang Zhi","Ruiyu Lyu","Wangzhen Li","Zhaori Bi","Keren Zhu","Yanhan Zeng","Weiwei Shan","Changhao Yan","Fan Yang","Yun Li","Xuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.08534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09152v1","updated":"2024-09-13T19:14:48Z","published":"2024-09-13T19:14:48Z","title":"Distributed Binary Optimization with In-Memory Computing: An Application\n for the SAT Problem","summary":" In-memory computing (IMC) has been shown to be a promising approach for\nsolving binary optimization problems while significantly reducing energy and\nlatency. 
Building on the advantages of parallel computation, we propose an\nIMC-compatible parallelism framework inspired by parallel tempering (PT),\nenabling cross-replica communication to improve the performance of IMC solvers.\nThis framework enables an IMC solver not only to improve performance beyond\nwhat can be achieved through parallelization, but also affords greater\nflexibility for the search process with low hardware overhead. We justify that\nthe framework can be applied to almost any IMC solver. We demonstrate the\neffectiveness of the framework for the Boolean satisfiability (SAT) problem,\nusing the WalkSAT heuristic as a proxy for existing IMC solvers. The resulting\nPT-inspired cooperative WalkSAT (PTIC-WalkSAT) algorithm outperforms the\ntraditional WalkSAT heuristic in terms of the iterations-to-solution in 76.3%\nof the tested problem instances and its na\\\"ive parallel variant (PA-WalkSAT)\ndoes so in 68.4% of the instances. An estimate of the energy overhead of the\nPTIC framework for two hardware accelerator architectures indicates that in\nboth cases the overhead of running the PTIC framework would be less than 1% of\nthe total energy required to run each accelerator.\n","authors":["Xiangyi Zhang","Ignacio Rozada","Fabian Böhm","Elisabetta Valiante","Moslem Noori","Thomas Van Vaerenbergh","Chan-Woo Yang","Giacomo Pedretti","Masoud Mohseni","Raymond Beausoleil"],"pdf_url":"https://arxiv.org/pdf/2409.09152v1.pdf","comment":"21 pages, 9 figures"}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2409.07903v2","updated":"2024-09-13T08:44:52Z","published":"2024-09-12T10:14:29Z","title":"Dynamic Simultaneous Multithreaded Architecture","summary":" This paper presents the Dynamic Simultaneous Multi-threaded Architecture\n(DSMT). DSMT efficiently exe-cutes multiple threads from a single program on a\nSMT processor core. 
To accomplish this, threads are generated dynamically from\na predictable flow of control and then executed speculatively. Data obtained\nduring the single context non-speculative execution phase of DSMT is used as a\nhint to speculate the posterior behavior of multiple threads. DSMT employs\nsimple mechanisms based on state bits that keep track of inter-thread\ndependencies in registers and memory, synchronize thread execution, and control\nrecovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for\nchoosing those sections of code which provide the highest performance based on\ntheir past execution history. The DSMT architecture was simulated with a new\ncycle-accurate, execution-driven simulator. Our simulation results show that\nDSMT has very good potential to improve SMT performance, even when only a\nsingle program is available. However, we found that dynamic thread behavior\ntogether with fre-quent misspeculation may also produce diminishing re-turns in\nperformance. Therefore, the challenge is to max-imize the amount of\nthread-level parallelism that DSMT is capable of exploiting and at the same\ntime reduce the fre-quency of misspeculations.\n","authors":["Daniel Ortiz-Arroyo","Ben Lee"],"pdf_url":"https://arxiv.org/pdf/2409.07903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03992v2","updated":"2024-09-13T04:54:28Z","published":"2024-09-06T02:44:27Z","title":"Confidential Computing on nVIDIA H100 GPU: A Performance Benchmark Study","summary":" This report evaluates the performance impact of enabling Trusted Execution\nEnvironments (TEE) on nVIDIA H100 GPUs for large language model (LLM) inference\ntasks. We benchmark the overhead introduced by TEE mode across various LLMs and\ntoken lengths, with a particular focus on the bottleneck caused by CPU-GPU data\ntransfers via PCIe. 
Our results indicate that while there is minimal\ncomputational overhead within the GPU, the overall performance penalty is\nprimarily attributable to data transfer. For the majority of typical LLM\nqueries, the overhead remains below 5%, with larger models and longer sequences\nexperiencing nearly zero overhead.\n","authors":["Jianwei Zhu","Hang Yin","Peng Deng","Shunfan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.03992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.10856v4","updated":"2024-09-13T17:54:33Z","published":"2021-01-26T15:24:22Z","title":"BE-RAN: Blockchain-enabled Open RAN for 6G with DID and\n Privacy-Preserving Communication","summary":" As 6G networks evolve towards a synergistic system of Communication, Sensing,\nand Computing, Radio Access Networks become more distributed, necessitating\nrobust end-to-end authentication. We propose Blockchain-enabled Radio Access\nNetworks, a novel decentralized RAN architecture enhancing security, privacy,\nand efficiency in authentication processes. BE-RAN leverages distributed ledger\ntechnology to establish trust, offering user-centric identity management,\nenabling mutual authentication, and facilitating on-demand point-to-point\ninter-network elements and UE-UE communication with accountable logging and\nbilling service add-on for public network users, all without relying on\ncentralized authorities. We envision a thoroughly decentralized RAN model and\npropose a privacy-preserving P2P communication approach that complements\nexisting security measures while supporting the CSC paradigm. 
Results\ndemonstrate BE-RAN significantly reduces communication and computation\noverheads, enhances privacy through decentralized identity management, and\nfacilitates CSC integration, advancing towards more efficient and secure 6G\nnetworks.\n","authors":["Hao Xu","Zihan Zhou","Lei Zhang","Yunqing Sun","Chih-Lin I"],"pdf_url":"https://arxiv.org/pdf/2101.10856v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08949v1","updated":"2024-09-13T16:08:08Z","published":"2024-09-13T16:08:08Z","title":"Generic and ML Workloads in an HPC Datacenter: Node Energy, Job\n Failures, and Node-Job Analysis","summary":" HPC datacenters offer a backbone to the modern digital society. Increasingly,\nthey run Machine Learning (ML) jobs next to generic, compute-intensive\nworkloads, supporting science, business, and other decision-making processes.\nHowever, understanding how ML jobs impact the operation of HPC datacenters,\nrelative to generic jobs, remains desirable but understudied. In this work, we\nleverage long-term operational data, collected from a national-scale production\nHPC datacenter, and statistically compare how ML and generic jobs can impact\nthe performance, failures, resource utilization, and energy consumption of HPC\ndatacenters. Our study provides key insights, e.g., ML-related power usage\ncauses GPU nodes to run into temperature limitations, median/mean runtime and\nfailure rates are higher for ML jobs than for generic jobs, both ML and generic\njobs exhibit highly variable arrival processes and resource demands,\nsignificant amounts of energy are spent on unsuccessfully terminating jobs, and\nconcurrent jobs tend to terminate in the same state. 
We open-source our\ncleaned-up data traces on Zenodo (https://doi.org/10.5281/zenodo.13685426), and\nprovide our analysis toolkit as software hosted on GitHub\n(https://github.com/atlarge-research/2024-icpads-hpc-workload-characterization).\nThis study offers multiple benefits for data center administrators, who can\nimprove operational efficiency, and for researchers, who can further improve\nsystem designs, scheduling techniques, etc.\n","authors":["Xiaoyu Chu","Daniel Hofstätter","Shashikant Ilager","Sacheendra Talluri","Duncan Kampert","Damian Podareanu","Dmitry Duplyakin","Ivona Brandic","Alexandru Iosup"],"pdf_url":"https://arxiv.org/pdf/2409.08949v1.pdf","comment":"10 pages, 10 figures, 6 tables, ICPADS 2024"},{"id":"http://arxiv.org/abs/2409.08858v1","updated":"2024-09-13T14:20:28Z","published":"2024-09-13T14:20:28Z","title":"Exploring System-Heterogeneous Federated Learning with Dynamic Model\n Selection","summary":" Federated learning is a distributed learning paradigm in which multiple\nmobile clients train a global model while keeping data local. These mobile\nclients can have various available memory and network bandwidth. However, to\nachieve the best global model performance, how we can utilize available memory\nand network bandwidth to the maximum remains an open challenge. In this paper,\nwe propose to assign each client a subset of the global model, having different\nlayers and channels on each layer. To realize that, we design a constrained\nmodel search process with early stop to improve efficiency of finding the\nmodels from such a very large space; and a data-free knowledge distillation\nmechanism to improve the global model performance when aggregating models of\nsuch different structures. For fair and reproducible comparison between\ndifferent solutions, we develop a new system, which can directly allocate\ndifferent memory and bandwidth to each client according to memory and bandwidth\nlogs collected on mobile devices. 
The evaluation shows that our solution can\nhave accuracy increase ranging from 2.43\\% to 15.81\\% and provide 5\\% to 40\\%\nmore memory and bandwidth utilization with negligible extra running time,\ncomparing to existing state-of-the-art system-heterogeneous federated learning\nmethods under different available memory and bandwidth, non-i.i.d.~datasets,\nimage and text tasks.\n","authors":["Dixi Yao"],"pdf_url":"https://arxiv.org/pdf/2409.08858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08729v1","updated":"2024-09-13T11:32:24Z","published":"2024-09-13T11:32:24Z","title":"Accurate Computation of the Logarithm of Modified Bessel Functions on\n GPUs","summary":" Bessel functions are critical in scientific computing for applications such\nas machine learning, protein structure modeling, and robotics. However,\ncurrently, available routines lack precision or fail for certain input ranges,\nsuch as when the order $v$ is large, and GPU-specific implementations are\nlimited. We address the precision limitations of current numerical\nimplementations while dramatically improving the runtime. We propose two novel\nalgorithms for computing the logarithm of modified Bessel functions of the\nfirst and second kinds by computing intermediate values on a logarithmic scale.\nOur algorithms are robust and never have issues with underflows or overflows\nwhile having relative errors on the order of machine precision, even for inputs\nwhere existing libraries fail. 
In C++/CUDA, our algorithms have median and\nmaximum speedups of 45x and 6150x for GPU and 17x and 3403x for CPU,\nrespectively, over the ranges of inputs and third-party libraries tested.\nCompared to SciPy, the algorithms have median and maximum speedups of 77x and\n300x for GPU and 35x and 98x for CPU, respectively, over the tested inputs.\n The ability to robustly compute a solution and the low relative errors allow\nus to fit von Mises-Fisher, vMF, distributions to high-dimensional neural\nnetwork features. This is, e.g., relevant for uncertainty quantification in\nmetric learning. We obtain image feature data by processing CIFAR10 training\nimages with the convolutional layers of a pre-trained ResNet50. We successfully\nfit vMF distributions to 2048-, 8192-, and 32768-dimensional image feature data\nusing our algorithms. Our approach provides fast and accurate results while\nexisting implementations in SciPy and mpmath fail to fit successfully.\n Our approach is readily implementable on GPUs, and we provide a fast\nopen-source implementation alongside this paper.\n","authors":["Andreas Plesner","Hans Henrik Brandenborg Sørensen","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2409.08729v1.pdf","comment":"Accepted at ICS 2024"},{"id":"http://arxiv.org/abs/2409.08640v1","updated":"2024-09-13T08:53:10Z","published":"2024-09-13T08:53:10Z","title":"Byzantine-Robust and Communication-Efficient Distributed Learning via\n Compressed Momentum Filtering","summary":" Distributed learning has become the standard approach for training\nlarge-scale machine learning models across private data silos. While\ndistributed learning enhances privacy preservation and training efficiency, it\nfaces critical challenges related to Byzantine robustness and communication\nreduction. 
Existing Byzantine-robust and communication-efficient methods rely\non full gradient information either at every iteration or at certain iterations\nwith a probability, and they only converge to an unnecessarily large\nneighborhood around the solution. Motivated by these issues, we propose a novel\nByzantine-robust and communication-efficient stochastic distributed learning\nmethod that imposes no requirements on batch size and converges to a smaller\nneighborhood around the optimal solution than all existing methods, aligning\nwith the theoretical lower bound. Our key innovation is leveraging Polyak\nMomentum to mitigate the noise caused by both biased compressors and stochastic\ngradients, thus defending against Byzantine workers under information\ncompression. We provide proof of tight complexity bounds for our algorithm in\nthe context of non-convex smooth loss functions, demonstrating that these\nbounds match the lower bounds in Byzantine-free scenarios. Finally, we validate\nthe practical significance of our algorithm through an extensive series of\nexperiments, benchmarking its performance on both binary classification and\nimage classification tasks.\n","authors":["Changxin Liu","Yanghao Li","Yuhao Yi","Karl H. Johansson"],"pdf_url":"https://arxiv.org/pdf/2409.08640v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.08584v1","updated":"2024-09-13T07:03:01Z","published":"2024-09-13T07:03:01Z","title":"CompressedMediQ: Hybrid Quantum Machine Learning Pipeline for\n High-Dimentional Neuroimaging Data","summary":" This paper introduces CompressedMediQ, a novel hybrid quantum-classical\nmachine learning pipeline specifically developed to address the computational\nchallenges associated with high-dimensional multi-class neuroimaging data\nanalysis. 
Standard neuroimaging datasets, such as 4D MRI data from the\nAlzheimer's Disease Neuroimaging Initiative (ADNI) and Neuroimaging in\nFrontotemporal Dementia (NIFD), present significant hurdles due to their vast\nsize and complexity. CompressedMediQ integrates classical high-performance\ncomputing (HPC) nodes for advanced MRI pre-processing and Convolutional Neural\nNetwork (CNN)-PCA-based feature extraction and reduction, addressing the\nlimited-qubit availability for quantum data encoding in the NISQ (Noisy\nIntermediate-Scale Quantum) era. This is followed by Quantum Support Vector\nMachine (QSVM) classification. By utilizing quantum kernel methods, the\npipeline optimizes feature mapping and classification, enhancing data\nseparability and outperforming traditional neuroimaging analysis techniques.\nExperimental results highlight the pipeline's superior accuracy in dementia\nstaging, validating the practical use of quantum machine learning in clinical\ndiagnostics. Despite the limitations of NISQ devices, this proof-of-concept\ndemonstrates the transformative potential of quantum-enhanced learning, paving\nthe way for scalable and precise diagnostic tools in healthcare and signal\nprocessing.\n","authors":["Kuan-Cheng Chen","Yi-Tien Li","Tai-Yu Li","Chen-Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2409.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09202v1","updated":"2024-09-13T21:31:45Z","published":"2024-09-13T21:31:45Z","title":"WarmSwap: Sharing Dependencies for Accelerating Cold Starts in\n Serverless Functions","summary":" This work presents WarmSwap, a novel provider-side cold-start optimization\nfor serverless computing. 
This optimization reduces cold-start time when\nbooting and loading dependencies at runtime inside a function container.\nPrevious approaches to the optimization of cold starts tend to fall into two\ncategories: optimizing the infrastructure of serverless computing to benefit\nall serverless functions; or function-specific tuning for individual serverless\nfunctions. In contrast, WarmSwap offers a broad middle ground, which optimizes\nentire categories of serverless functions. WarmSwap eliminates the need to\ninitialize middleware or software dependencies when launching a new serverless\ncontainer, by migrating a pre-initialized live dependency image to the new\nfunction instance. WarmSwap respects the provider's cache constraints, as a\nsingle pre-warmed dependency image in the cache is shared among all serverless\nfunctions requiring that software dependency image. WarmSwap has been tested on\nseven representative functions from FunctionBench. The functions are chosen to\ncompare with previous work. In those tests, WarmSwap accelerates cold-start\nexecutions for those serverless functions with large dependency requirements by\na factor ranging from 1.2 to 2.2.\n","authors":["Rui Li","Devesh Tiwari","Gene Cooperman"],"pdf_url":"https://arxiv.org/pdf/2409.09202v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2011.12431v2","updated":"2024-09-13T23:10:41Z","published":"2020-11-24T22:28:27Z","title":"Proposal of Automatic Offloading Method in Mixed Offloading Destination\n Environment","summary":" When using heterogeneous hardware, barriers of technical skills such as\nOpenMP, CUDA and OpenCL are high. Based on that, I have proposed\nenvironment-adaptive software that enables automatic conversion, configuration.\nHowever, including existing technologies, there has been no research to\nproperly and automatically offload the mixed offloading destination environment\nsuch as GPU, FPGA and many core CPU. 
In this paper, as a new element of\nenvironment-adaptive software, I study a method for offloading applications\nproperly and automatically in the environment where the offloading destination\nis mixed with GPU, FPGA and many core CPU.\n Y. Yamato, \"Proposal of Automatic Offloading Method in Mixed Offloading\nDestination Environment,\" 2020 Eighth International Symposium on Computing and\nNetworking Workshops (CANDARW 2020), pp.460-464, DOI:\n10.1109/CANDARW51189.2020.00094, Nov. 2020.\n \"(c) 2020 IEEE. Personal use of this material is permitted. Permission from\nIEEE must be obtained for all other uses, in any current or future media,\nincluding reprinting/republishing this material for advertising or promotional\npurposes, creating new collective works, for resale or redistribution to\nservers or lists, or reuse of any copyrighted component of this work in other\nworks.\"\n","authors":["Yoji Yamato"],"pdf_url":"https://arxiv.org/pdf/2011.12431v2.pdf","comment":"5 pages, 3 figures"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2408.13745v3","updated":"2024-09-13T05:50:11Z","published":"2024-08-25T07:10:36Z","title":"DOCE: Finding the Sweet Spot for Execution-Based Code Generation","summary":" Recently, a diverse set of decoding and reranking procedures have been shown\neffective for LLM-based code generation. However, a comprehensive framework\nthat links and experimentally compares these methods is missing. We address\nthis by proposing Decoding Objectives for Code Execution, a comprehensive\nframework that includes candidate generation, $n$-best reranking, minimum Bayes\nrisk (MBR) decoding, and self-debugging as the core components. We then study\nthe contributions of these components through execution-based evaluation\nmetrics. 
Our findings highlight the importance of execution-based methods and\nthe difference gap between execution-based and execution-free methods.\nFurthermore, we assess the impact of filtering based on trial unit tests, a\nsimple and effective strategy that has been often overlooked in prior works. We\nalso propose self-debugging on multiple candidates, obtaining state-of-the-art\nperformance on reranking for code generation. We expect our framework to\nprovide a solid guideline for future research on code generation.\n","authors":["Haau-Sing Li","Patrick Fernandes","Iryna Gurevych","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2408.13745v3.pdf","comment":"10 pages (32 including appendix), 5 figures, 25 tables. Prompts are\n provided in the GitHub repository to avoid potential text overlap with other\n papers"},{"id":"http://arxiv.org/abs/2408.14515v2","updated":"2024-09-13T04:25:37Z","published":"2024-08-25T11:33:52Z","title":"A Joint Learning Model with Variational Interaction for Multilingual\n Program Translation","summary":" Programs implemented in various programming languages form the foundation of\nsoftware applications. To alleviate the burden of program migration and\nfacilitate the development of software systems, automated program translation\nacross languages has garnered significant attention. Previous approaches\nprimarily focus on pairwise translation paradigms, learning translation between\npairs of languages using bilingual parallel data. However, parallel data is\ndifficult to collect for some language pairs, and the distribution of program\nsemantics across languages can shift, posing challenges for pairwise program\ntranslation. In this paper, we argue that jointly learning a unified model to\ntranslate code across multiple programming languages is superior to separately\nlearning from bilingual parallel data. 
We propose Variational Interaction for\nMultilingual Program Translation~(VIM-PT), a disentanglement-based generative\napproach that jointly trains a unified model for multilingual program\ntranslation across multiple languages. VIM-PT disentangles code into\nlanguage-shared and language-specific features, using variational inference and\ninteraction information with a novel lower bound, then achieves program\ntranslation through conditional generation. VIM-PT demonstrates four\nadvantages: 1) captures language-shared information more accurately from\nvarious implementations and improves the quality of multilingual program\ntranslation, 2) mines and leverages the capability of non-parallel data, 3)\naddresses the distribution shift of program semantics across languages, 4) and\nserves as a unified model, reducing deployment complexity.\n","authors":["Yali Du","Hui Sun","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2408.14515v2.pdf","comment":"Accepted by the 39th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2024)"},{"id":"http://arxiv.org/abs/2403.16218v2","updated":"2024-09-13T15:45:10Z","published":"2024-03-24T16:18:27Z","title":"CoverUp: Coverage-Guided LLM-Based Test Generation","summary":" Testing is an essential part of software development. Test generation tools\nattempt to automate the otherwise labor-intensive task of test creation, but\ngenerating high-coverage tests remains a challenge. This paper proposes\nCoverUp, a novel approach to driving the generation of high-coverage Python\nregression tests. CoverUp iteratively improves test coverage, interleaving\ncoverage analysis with dialogs with the LLM that steer it to refine tests so\nthat they increase coverage of lines and branches. We evaluate our prototype\nCoverUp implementation across a benchmark of challenging code derived from\nopen-source Python projects, and show that CoverUp substantially improves on\nthe state of the art. 
Compared to CodaMosa, a hybrid search/LLM-based test\ngenerator, CoverUp achieves a per-module median line+branch coverage of 80%\n(vs. 47%). Compared to MuTAP, a mutation/LLM-based test generator, CoverUp\nachieves an overall line+branch coverage of 90% (vs. 77%). We show that\nCoverUp's iterative, coverage-guided approach is crucial to its effectiveness,\ncontributing to nearly 40% of its successes.\n","authors":["Juan Altmayer Pizzorno","Emery D. Berger"],"pdf_url":"https://arxiv.org/pdf/2403.16218v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2404.01903v2","updated":"2024-09-13T14:56:46Z","published":"2024-04-02T12:44:44Z","title":"Understanding How CodeLLMs (Mis)Predict Types with Activation Steering","summary":" CodeLLMs are transforming software development as we know it. This is\nespecially true for tasks where rule-based approaches fall short, like type\nprediction. The type prediction task consists in adding a new type annotation\nto a partially typed program, such that the resulting program is closer to\nbeing fully typed. The intractability of rule-based approaches and high cost of\nmanual annotation make CodeLLMs an attractive solution to the problem. However,\nCodeLLMs are still far from being deployed on the large-scale due to doubts\nsurrounding their reliability.\n To shed some light on how CodeLLMs approach type prediction, we investigate\nwhat happens when a model mispredicts a type. We show that by applying\nsemantics-preserving edits to code, CodeLLMs are eventually misled into\nmispredicting type annotations. However, by leveraging activation steering we\nare able to \"steer\" the model back to the correct prediction, making models\nmore robust against semantically irrelevant prompt features. We show that\nsteering achieves comparable performance to fine-tuning directly on the type\nprediction task. 
Furthermore, we find that steering vectors computed from\nPython code are effective at correcting TypeScript mispredictions, and vice\nversa. To our knowledge, this is the first evidence of its kind to suggest that\nCodeLLMs learn task representations that transfer across languages.\n","authors":["Francesca Lucchetti","Arjun Guha"],"pdf_url":"https://arxiv.org/pdf/2404.01903v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2203.08416v2","updated":"2024-09-13T07:29:17Z","published":"2022-03-16T06:21:12Z","title":"On Higher-Order Reachability Games vs May Reachability","summary":" We consider the reachability problem for higher-order functional programs and\nstudy the relationship between reachability games (i.e., the reachability\nproblem for programs with angelic and demonic nondeterminism) and\nmay-reachability (i.e., the reachability problem for programs with only angelic\nnondeterminism). We show that reachability games for order-n programs can be\nreduced to may-reachability for order-(n+1) programs, and vice versa. We\nformalize the reductions by using higher-order fixpoint logic and prove their\ncorrectness. We also discuss applications of the reductions to higher-order\nprogram verification.\n","authors":["Kazuyuki Asada","Hiroyuki Katsura","Naoki Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2203.08416v2.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2409.03992v2","updated":"2024-09-13T04:54:28Z","published":"2024-09-06T02:44:27Z","title":"Confidential Computing on nVIDIA H100 GPU: A Performance Benchmark Study","summary":" This report evaluates the performance impact of enabling Trusted Execution\nEnvironments (TEE) on nVIDIA H100 GPUs for large language model (LLM) inference\ntasks. We benchmark the overhead introduced by TEE mode across various LLMs and\ntoken lengths, with a particular focus on the bottleneck caused by CPU-GPU data\ntransfers via PCIe. 
Our results indicate that while there is minimal\ncomputational overhead within the GPU, the overall performance penalty is\nprimarily attributable to data transfer. For the majority of typical LLM\nqueries, the overhead remains below 5%, with larger models and longer sequences\nexperiencing nearly zero overhead.\n","authors":["Jianwei Zhu","Hang Yin","Peng Deng","Shunfan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.03992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08595v1","updated":"2024-09-13T07:27:55Z","published":"2024-09-13T07:27:55Z","title":"Automatic Generation of Fast and Accurate Performance Models for Deep\n Neural Network Accelerators","summary":" Implementing Deep Neural Networks (DNNs) on resource-constrained edge devices\nis a challenging task that requires tailored hardware accelerator architectures\nand a clear understanding of their performance characteristics when executing\nthe intended AI workload. To facilitate this, we present an automated\ngeneration approach for fast performance models to accurately estimate the\nlatency of a DNN mapped onto systematically modeled and concisely described\naccelerator architectures. Using our accelerator architecture description\nmethod, we modeled representative DNN accelerators such as Gemmini, UltraTrail,\nPlasticine-derived, and a parameterizable systolic array. Together with DNN\nmappings for those modeled architectures, we perform a combined DNN/hardware\ndependency graph analysis, which enables us, in the best case, to evaluate only\n154 loop kernel iterations to estimate the performance for 4.19 billion\ninstructions achieving a significant speedup. 
We outperform regression and\nanalytical models in terms of mean absolute percentage error (MAPE) compared to\nsimulation results, while being several magnitudes faster than an RTL\nsimulation.\n","authors":["Konstantin Lübeck","Alexander Louis-Ferdinand Jung","Felix Wedlich","Mika Markus Müller","Federico Nicolás Peccia","Felix Thömmes","Jannik Steinmetz","Valentin Biermaier","Adrian Frischknecht","Paul Palomero Bernardo","Oliver Bringmann"],"pdf_url":"https://arxiv.org/pdf/2409.08595v1.pdf","comment":"Accepted version for: ACM Transactions on Embedded Computing Systems"}],"Operation Systems":[{"id":"http://arxiv.org/abs/2408.04104v3","updated":"2024-09-13T02:48:33Z","published":"2024-08-07T21:45:01Z","title":"Hardware-Assisted Virtualization of Neural Processing Units for Cloud\n Platforms","summary":" Cloud platforms today have been deploying hardware accelerators like neural\nprocessing units (NPUs) for powering machine learning (ML) inference services.\nTo maximize the resource utilization while ensuring reasonable quality of\nservice, a natural approach is to virtualize NPUs for efficient resource\nsharing for multi-tenant ML services. However, virtualizing NPUs for modern\ncloud platforms is not easy. This is not only due to the lack of system\nabstraction support for NPU hardware, but also due to the lack of architectural\nand ISA support for enabling fine-grained dynamic operator scheduling for\nvirtualized NPUs.\n We present Neu10, a holistic NPU virtualization framework. We investigate\nvirtualization techniques for NPUs across the entire software and hardware\nstack. 
Neu10 consists of (1) a flexible NPU abstraction called vNPU, which\nenables fine-grained virtualization of the heterogeneous compute units in a\nphysical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go\ncomputing model and flexible vNPU-to-pNPU mappings for improved resource\nutilization and cost-effectiveness; (3) an ISA extension of modern NPU\narchitecture for facilitating fine-grained tensor operator scheduling for\nmultiple vNPUs. We implement Neu10 based on a production-level NPU simulator.\nOur experiments show that Neu10 improves the throughput of ML inference\nservices by up to 1.4$\\times$ and reduces the tail latency by up to\n4.6$\\times$, while improving the NPU utilization by 1.2$\\times$ on average,\ncompared to state-of-the-art NPU sharing approaches.\n","authors":["Yuqi Xue","Yiqi Liu","Lifeng Nai","Jian Huang"],"pdf_url":"https://arxiv.org/pdf/2408.04104v3.pdf","comment":"Accepted to MICRO'24"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2409.07201v2","updated":"2024-09-13T11:43:30Z","published":"2024-09-11T11:46:07Z","title":"Improved Hardness Results of the Cardinality-Based Minimum s-t Cut\n Problem in Hypergraphs","summary":" In hypergraphs an edge that crosses a cut can be split in several ways,\ndepending on how many nodes are placed on each side of the cut. A\ncardinality-based splitting function assigns a nonnegative cost of $w_i$ for\neach cut hyperedge $e$ with exactly $i$ nodes on the side of the cut that\ncontains the minority of nodes from $e$. The cardinality-based minimum $s$-$t$\ncut aims to find an $s$-$t$ cut with minimum total cost. Assuming the costs\n$w_i$ are polynomially bounded by the input size and $w_0=0$ and $w_1=1$, we\nshow that the problem becomes NP-hard outside the submodular region found by\nVeldt et al. 
Our result also holds for $k$-uniform hypergraphs with $k \\geq 4$.\nSpecifically for $4$-uniform hypergraphs we show that the problem is NP-hard\nfor all $w_2>2$, and additionally prove that the \\textsc{No-Even-Split} problem\nis NP-hard.\n","authors":["Florian Adriaens","Iiro Kumpulainen","Nikolaj Tatti"],"pdf_url":"https://arxiv.org/pdf/2409.07201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07035v2","updated":"2024-09-13T11:40:12Z","published":"2024-09-11T06:06:51Z","title":"Approximately counting maximal independent set is equivalent to #SAT","summary":" A maximal independent set is an independent set that is not a subset of any\nother independent set. It is also the key problem of mathematics, computer\nscience, and other fields. A counting problem is a type of computational\nproblem that associated with the number of solutions. Besides, counting\nproblems help us better understand several fields such as algorithm analysis,\ncomplexity theory, artificial intelligence, etc. The problem of counting\nmaximal independent sets is #P-complete. So it is natural to think about\napproximate counting for maximal independent sets problem. 
In this article, we\nstudy the complexity of approximately counting maximal independent sets.\nSpecifically, we are the first to prove that the #MIS problem is\nAP-interreducible with the #SAT of a given general graph.\n","authors":["Hao Zhang","Tonghua Su"],"pdf_url":"https://arxiv.org/pdf/2409.07035v2.pdf","comment":"After discussion, this is already known in JCSS (with the\n arXiv:1411.6829),proving that approximately counting MIS in bipartite graphs\n is equivalent to #SAT under AP-reductions, it is a stronger result if it\n restricts to bipartite graphs, which implies it for general graphs.\n Therefore, this paper tends to be more of a direct proof exercise"},{"id":"http://arxiv.org/abs/2409.08883v1","updated":"2024-09-13T14:48:54Z","published":"2024-09-13T14:48:54Z","title":"Vertex identification to a forest","summary":" Let $\\mathcal{H}$ be a graph class and $k\\in\\mathbb{N}$. We say a graph $G$\nadmits a \\emph{$k$-identification to $\\mathcal{H}$} if there is a partition\n$\\mathcal{P}$ of some set $X\\subseteq V(G)$ of size at most $k$ such that after\nidentifying each part in $\\mathcal{P}$ to a single vertex, the resulting graph\nbelongs to $\\mathcal{H}$. The graph parameter ${\\sf id}_{\\mathcal{H}}$ is\ndefined so that ${\\sf id}_{\\mathcal{H}}(G)$ is the minimum $k$ such that $G$\nadmits a $k$-identification to $\\mathcal{H}$, and the problem of\n\\textsc{Identification to $\\mathcal{H}$} asks, given a graph $G$ and\n$k\\in\\mathbb{N}$, whether ${\\sf id}_{\\mathcal{H}}(G)\\le k$. If we set\n$\\mathcal{H}$ to be the class $\\mathcal{F}$ of acyclic graphs, we generate the\nproblem \\textsc{Identification to Forest}, which we show to be {\\sf\nNP}-complete. We prove that, when parameterized by the size $k$ of the\nidentification set, it admits a kernel of size $2k+1$. For our kernel we reveal\na close relation of \\textsc{Identification to Forest} with the \\textsc{Vertex\nCover} problem. 
We also study the combinatorics of the \\textsf{yes}-instances\nof \\textsc{Identification to $\\mathcal{H}$}, i.e., the class\n$\\mathcal{H}^{(k)}:=\\{G\\mid {\\sf id}_{\\mathcal{H}}(G)\\le k\\}$, {which we show\nto be minor-closed for every $k$} when $\\mathcal{H}$ is minor-closed. We prove\nthat the minor-obstructions of $\\mathcal{F}^{(k)}$ are of size at most $2k+4$.\nWe also prove that every graph $G$ such that ${\\sf id}_{\\mathcal{F}}(G)$ is\nsufficiently big contains as a minor either a cycle on $k$ vertices, or $k$\ndisjoint triangles, or the \\emph{$k$-marguerite} graph, that is the graph\nobtained by $k$ disjoint triangles by identifying one vertex of each of them\ninto the same vertex.\n","authors":["Laure Morelle","Ignasi Sau","Dimitrios M. Thilikos"],"pdf_url":"https://arxiv.org/pdf/2409.08883v1.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08762v1","updated":"2024-09-13T12:11:24Z","published":"2024-09-13T12:11:24Z","title":"Rice-like complexity lower bounds for Boolean and uniform automata\n networks","summary":" Automata networks are a versatile model of finite discrete dynamical systems\ncomposed of interacting entities (the automata), able to embed any directed\ngraph as a dynamics on its space of configurations (the set of vertices,\nrepresenting all the assignments of a state to each entity). In this world,\nvirtually any question is decidable by a simple exhaustive search. We lever the\nRice-like complexity lower bound, stating that any non-trivial monadic second\norder logic question on the graph of its dynamics is NP-hard or coNP-hard\n(given the automata network description), to bounded alphabets (including the\nBoolean case). This restriction is particularly meaningful for applications to\n\"complex systems\", where each entity has a restricted set of possible states\n(its alphabet). 
For the non-deterministic case, trivial questions are solvable\nin constant time, hence there is a sharp gap in complexity for the algorithmic\nsolving of concrete problems on them. For the non-deterministic case,\nnon-triviality is defined at bounded treewidth, which offers a structure to\nestablish metatheorems of complexity lower bounds.\n","authors":["Aliénor Goubault--Larrecq","Kévin Perrot"],"pdf_url":"https://arxiv.org/pdf/2409.08762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08761v1","updated":"2024-09-13T12:09:20Z","published":"2024-09-13T12:09:20Z","title":"Journalists, Emotions, and the Introduction of Generative AI Chatbots: A\n Large-Scale Analysis of Tweets Before and After the Launch of ChatGPT","summary":" As part of a broader look at the impact of generative AI, this study\ninvestigated the emotional responses of journalists to the release of ChatGPT\nat the time of its launch. By analyzing nearly 1 million Tweets from\njournalists at major U.S. news outlets, we tracked changes in emotional tone\nand sentiment before and after the introduction of ChatGPT in November 2022.\nUsing various computational and natural language processing techniques to\nmeasure emotional shifts in response to ChatGPT's release, we found an increase\nin positive emotion and a more favorable tone post-launch, suggesting initial\noptimism toward AI's potential. This research underscores the pivotal role of\njournalists as interpreters of technological innovation and disruption,\nhighlighting how their emotional reactions may shape public narratives around\nemerging technologies. The study contributes to understanding the intersection\nof journalism, emotion, and AI, offering insights into the broader societal\nimpact of generative AI tools.\n","authors":["Seth C. Lewis","David M. 
Markowitz","Jon Benedik Bunquin"],"pdf_url":"https://arxiv.org/pdf/2409.08761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00440v4","updated":"2024-09-13T08:38:18Z","published":"2023-11-01T11:09:01Z","title":"Maximum $k$- vs. $\\ell$-colourings of graphs","summary":" We present polynomial-time SDP-based algorithms for the following problem:\nFor fixed $k \\leq \\ell$, given a real number $\\epsilon>0$ and a graph $G$ that\nadmits a $k$-colouring with a $\\rho$-fraction of the edges coloured properly,\nit returns an $\\ell$-colouring of $G$ with an $(\\alpha \\rho -\n\\epsilon)$-fraction of the edges coloured properly in polynomial time in $G$\nand $1 / \\epsilon$. Our algorithms are based on the algorithms of Frieze and\nJerrum [Algorithmica'97] and of Karger, Motwani and Sudan [JACM'98].\n When $k$ is fixed and $\\ell$ grows large, our algorithm achieves an\napproximation ratio of $\\alpha = 1 - o(1 / \\ell)$. When $k, \\ell$ are both\nlarge, our algorithm achieves an approximation ratio of $\\alpha = 1 - 1 / \\ell\n+ 2 \\ln \\ell / k \\ell - o(\\ln \\ell / k \\ell) - O(1 / k^2)$; if we fix $d = \\ell\n- k$ and allow $k, \\ell$ to grow large, this is $\\alpha = 1 - 1 / \\ell + 2 \\ln\n\\ell / k \\ell - o(\\ln \\ell / k \\ell)$.\n By extending the results of Khot, Kindler, Mossel and O'Donnell [SICOMP'07]\nto the promise setting, we show that for large $k$ and $\\ell$, assuming Khot's\nUnique Games Conjecture (\\UGC), it is \\NP-hard to achieve an approximation\nratio $\\alpha$ greater than $1 - 1 / \\ell + 2 \\ln \\ell / k \\ell + o(\\ln \\ell /\nk \\ell)$, provided that $\\ell$ is bounded by a function that is\n$o(\\exp(\\sqrt[3]{k}))$. 
For the case where $d = \\ell - k$ is fixed, this bound\nmatches the performance of our algorithm up to $o(\\ln \\ell / k \\ell)$.\nFurthermore, by extending the results of Guruswami and Sinop [ToC'13] to the\npromise setting, we prove that it is \\NP-hard to achieve an approximation ratio\ngreater than $1 - 1 / \\ell + 8 \\ln \\ell / k \\ell + o(\\ln \\ell / k \\ell)$,\nprovided again that $\\ell$ is bounded as before (but this time without assuming\nthe \\UGC).\n","authors":["Tamio-Vesa Nakajima","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2311.00440v4.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2409.07077v2","updated":"2024-09-13T07:31:06Z","published":"2024-09-11T08:02:06Z","title":"Submonoid Membership in n-dimensional lamplighter groups and S-unit\n equations","summary":" We show that Submonoid Membership is decidable in n-dimensional lamplighter\ngroups $(\\mathbb{Z}/p\\mathbb{Z}) \\wr \\mathbb{Z}^n$ for any prime $p$ and\ninteger $n$. More generally, we show decidability of Submonoid Membership in\nsemidirect products of the form $\\mathcal{Y} \\rtimes \\mathbb{Z}^n$, where\n$\\mathcal{Y}$ is any finitely presented module over the Laurent polynomial ring\n$\\mathbb{F}_p[X_1^{\\pm}, \\ldots, X_n^{\\pm}]$. Combined with a result of Shafrir\n(2024), this gives the first example of a group $G$ and a finite index subgroup\n$\\widetilde{G} \\leq G$, such that Submonoid Membership is decidable in\n$\\widetilde{G}$ but undecidable in $G$.\n To obtain our decidability result, we reduce Submonoid Membership in\n$\\mathcal{Y} \\rtimes \\mathbb{Z}^n$ to solving S-unit equations over\n$\\mathbb{F}_p[X_1^{\\pm}, \\ldots, X_n^{\\pm}]$-modules. We show that the solution\nset of such equations is effectively $p$-automatic, extending a result of\nAdamczewski and Bell (2012). 
As an intermediate result, we also obtain that the\nsolution set of the Knapsack Problem in $\\mathcal{Y} \\rtimes \\mathbb{Z}^n$ is\neffectively $p$-automatic.\n","authors":["Ruiwen Dong"],"pdf_url":"https://arxiv.org/pdf/2409.07077v2.pdf","comment":"corrected a mistake in Lemma 5.9, modified Lemma 5.8, some other\n minor changes"},{"id":"http://arxiv.org/abs/2207.09201v3","updated":"2024-09-13T15:27:34Z","published":"2022-07-19T11:22:32Z","title":"Subsequences in Bounded Ranges: Matching and Analysis Problems","summary":" In this paper, we consider a variant of the classical algorithmic problem of\nchecking whether a given word $v$ is a subsequence of another word $w$. More\nprecisely, we consider the problem of deciding, given a number $p$ (defining a\nrange-bound) and two words $v$ and $w$, whether there exists a factor\n$w[i:i+p-1]$ (or, in other words, a range of length $p$) of $w$ having $v$ as\nsubsequence (i.\\,e., $v$ occurs as a subsequence in the bounded range\n$w[i:i+p-1]$). We give matching upper and lower quadratic bounds for the time\ncomplexity of this problem. Further, we consider a series of algorithmic\nproblems in this setting, in which, for given integers $k$, $p$ and a word $w$,\nwe analyse the set $p$-Subseq$_{k}(w)$ of all words of length $k$ which occur\nas subsequence of some factor of length $p$ of $w$. Among these, we consider\nthe $k$-universality problem, the $k$-equivalence problem, as well as problems\nrelated to absent subsequences. Surprisingly, unlike the case of the classical\nmodel of subsequences in words where such problems have efficient solutions in\ngeneral, we show that most of these problems become intractable in the new\nsetting when subsequences in bounded ranges are considered. 
Finally, we provide\nan example of how some of our results can be applied to subsequence matching\nproblems for circular words.\n","authors":["Maria Kosche","Tore Koß","Florin Manea","Viktoriya Pak"],"pdf_url":"https://arxiv.org/pdf/2207.09201v3.pdf","comment":"Extended version of a paper which will appear in the proceedings of\n the 16th International Conference on Reachability Problems, RP 2022"},{"id":"http://arxiv.org/abs/2110.09434v2","updated":"2024-09-13T12:19:03Z","published":"2021-10-18T16:05:12Z","title":"Learning Realtime One-Counter Automata","summary":" We present a new learning algorithm for realtime one-counter automata. Our\nalgorithm uses membership and equivalence queries as in Angluin's L* algorithm,\nas well as counter value queries and partial equivalence queries. In a partial\nequivalence query, we ask the teacher whether the language of a given\nfinite-state automaton coincides with a counter-bounded subset of the target\nlanguage. We evaluate an implementation of our algorithm on a number of random\nbenchmarks and on a use case regarding efficient JSON-stream validation.\n","authors":["Véronique Bruyère","Guillermo A. Pérez","Gaëtan Staquet"],"pdf_url":"https://arxiv.org/pdf/2110.09434v2.pdf","comment":"55 pages, 9 figures, submitted to TACAS 2022"},{"id":"http://arxiv.org/abs/2409.08727v1","updated":"2024-09-13T11:29:05Z","published":"2024-09-13T11:29:05Z","title":"Run supports and initial algebra supports of weighted automata","summary":" We consider weighted automata over words and over trees where the weight\nalgebras are strong bimonoids, i.e., semirings which may lack distributivity.\nIt is well known that, for each such weighted automaton, its run semantics and\nits initial algebra semantics can be different, due to the presence of\nnondeterminism and the absence of distributivity. 
Here we investigate the\nquestion under which conditions on the strong bimonoid the support of the run\nsemantics equals the support of the initial algebra semantics. We prove a\ncharacterization of this equality in terms of strongly zero-sum-free strong\nbimonoids (for weighted automata over words) and in terms of bi-strongly\nzero-sum-free strong bimonoids (for weighted automata over trees). We also\nconsider shortly the images of the two semantics functions.\n","authors":["Manfred Droste","Heiko Vogler"],"pdf_url":"https://arxiv.org/pdf/2409.08727v1.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2409.08762v1","updated":"2024-09-13T12:11:24Z","published":"2024-09-13T12:11:24Z","title":"Rice-like complexity lower bounds for Boolean and uniform automata\n networks","summary":" Automata networks are a versatile model of finite discrete dynamical systems\ncomposed of interacting entities (the automata), able to embed any directed\ngraph as a dynamics on its space of configurations (the set of vertices,\nrepresenting all the assignments of a state to each entity). In this world,\nvirtually any question is decidable by a simple exhaustive search. We lever the\nRice-like complexity lower bound, stating that any non-trivial monadic second\norder logic question on the graph of its dynamics is NP-hard or coNP-hard\n(given the automata network description), to bounded alphabets (including the\nBoolean case). This restriction is particularly meaningful for applications to\n\"complex systems\", where each entity has a restricted set of possible states\n(its alphabet). For the non-deterministic case, trivial questions are solvable\nin constant time, hence there is a sharp gap in complexity for the algorithmic\nsolving of concrete problems on them. 
For the non-deterministic case,\nnon-triviality is defined at bounded treewidth, which offers a structure to\nestablish metatheorems of complexity lower bounds.\n","authors":["Aliénor Goubault--Larrecq","Kévin Perrot"],"pdf_url":"https://arxiv.org/pdf/2409.08762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08607v1","updated":"2024-09-13T07:41:40Z","published":"2024-09-13T07:41:40Z","title":"Winning Strategy Templates for Stochastic Parity Games towards\n Permissive and Resilient Control","summary":" Stochastic games play an important role for many purposes such as the control\nof cyber-physical systems (CPS), where the controller and the environment are\nmodeled as players. Conventional algorithms typically solve the game for a\nsingle winning strategy in order to develop a controller. However, in\napplications such as CPS control, permissive controllers are crucial as they\nallow the controlled system to adapt if additional constraints need to be\nimposed and also remain resilient to system changes at runtime. In this work,\nwe generalize the concept of permissive winning strategy templates, introduced\nby Anand et al. at TACAS and CAV 2023 for deterministic games, to encompass\nstochastic games. These templates represent an infinite number of winning\nstrategies and can adapt strategies to system changes efficiently. We focus on\nfive key winning objectives -- safety, reachability, B\\\"uchi, co-B\\\"uchi, and\nparity -- and present algorithms to construct templates for each objective. 
In\naddition, we propose a novel method to extract a winning strategy from a\ntemplate and provide discussions on template comparison.\n","authors":["Kittiphon Phalakarn","Sasinee Pruekprasert","Ichiro Hasuo"],"pdf_url":"https://arxiv.org/pdf/2409.08607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.08416v2","updated":"2024-09-13T07:29:17Z","published":"2022-03-16T06:21:12Z","title":"On Higher-Order Reachability Games vs May Reachability","summary":" We consider the reachability problem for higher-order functional programs and\nstudy the relationship between reachability games (i.e., the reachability\nproblem for programs with angelic and demonic nondeterminism) and\nmay-reachability (i.e., the reachability problem for programs with only angelic\nnondeterminism). We show that reachability games for order-n programs can be\nreduced to may-reachability for order-(n+1) programs, and vice versa. We\nformalize the reductions by using higher-order fixpoint logic and prove their\ncorrectness. We also discuss applications of the reductions to higher-order\nprogram verification.\n","authors":["Kazuyuki Asada","Hiroyuki Katsura","Naoki Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2203.08416v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02466v2","updated":"2024-09-13T22:57:18Z","published":"2023-10-03T22:26:44Z","title":"Parameterized Model-checking of Discrete-Timed Networks and\n Symmetric-Broadcast Systems","summary":" We study the complexity of the model-checking problem for parameterized\ndiscrete-timed systems with arbitrarily many anonymous and identical processes,\nwith and without a distinguished \"controller\", and communicating via\nsynchronous rendezvous. Our framework extends the seminal work from German and\nSistla on untimed systems by adding discrete-time clocks to processes. 
For the\ncase without a controller, we show that the systems can be efficiently\nsimulated -- and vice versa -- by systems of untimed processes that communicate\nvia rendezvous and symmetric broadcast, which we call \"RB-systems\". Symmetric\nbroadcast is a novel communication primitive that allows all processes to\nsynchronize at once; however, it does not distinguish between sending and\nreceiving processes. We show that the parameterized model-checking problem for\nsafety specifications is pspace-complete, and for liveness specifications it is\ndecidable in exptime. The latter result is proved using automata theory,\nrational linear programming, and geometric reasoning for solving certain\nreachability questions in a new variant of vector addition systems called\n\"vector rendezvous systems\". We believe these proof techniques are of\nindependent interest and will be useful in solving related problems. For the\ncase with a controller, we show that the parameterized model-checking problems\nfor RB-systems and systems with asymmetric broadcast as a primitive are\ninter-reducible. This allows us to prove that for discrete timed-networks with\na controller the parameterized model-checking problem is undecidable for\nliveness specifications. 
Our work exploits the intimate connection between\nparameterized discrete-timed systems and systems of processes communicating via\nbroadcast, providing a rare and surprising decidability result for liveness\nproperties of parameterized timed-systems, as well as extend work from untimed\nsystems to timed systems.\n","authors":["Benjamin Aminof","Sasha Rubin","Francesco Spegni","Florian Zuleger"],"pdf_url":"https://arxiv.org/pdf/2310.02466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09171v1","updated":"2024-09-13T20:03:53Z","published":"2024-09-13T20:03:53Z","title":"The Challenges of Effective AGM Belief Contraction","summary":" Despite the significant interest in extending the AGM paradigm of belief\nchange beyond finitary logics, the computational aspects of AGM have remained\nalmost untouched. We investigate the computability of AGM contraction on\nnon-finitary logics, and show an intriguing negative result: there are\ninfinitely many uncomputable AGM contraction functions in such logics.\nDrastically, even if we restrict the theories used to represent epistemic\nstates, in all non-trivial cases, the uncomputability remains. On the positive\nside, we identify an infinite class of computable AGM contraction functions on\nLinear Temporal Logic (LTL). We use B\\\"uchi automata to construct such\nfunctions as well as to represent and reason about LTL knowledge.\n","authors":["Dominik Klumpp","Jandson S. 
Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2409.09171v1.pdf","comment":"20 pages, 4 figures"}]},"2024-09-14T00:00:00Z":{"Performance Profiling":[{"id":"http://arxiv.org/abs/2408.05100v2","updated":"2024-09-14T11:26:31Z","published":"2024-08-09T14:41:32Z","title":"AI-driven Java Performance Testing: Balancing Result Quality with\n Testing Time","summary":" Performance testing aims at uncovering efficiency issues of software systems.\nIn order to be both effective and practical, the design of a performance test\nmust achieve a reasonable trade-off between result quality and testing time.\nThis becomes particularly challenging in Java context, where the software\nundergoes a warm-up phase of execution, due to just-in-time compilation. During\nthis phase, performance measurements are subject to severe fluctuations, which\nmay adversely affect quality of performance test results. However, these\napproaches often provide suboptimal estimates of the warm-up phase, resulting\nin either insufficient or excessive warm-up iterations, which may degrade\nresult quality or increase testing time. There is still a lack of consensus on\nhow to properly address this problem. Here, we propose and study an AI-based\nframework to dynamically halt warm-up iterations at runtime. Specifically, our\nframework leverages recent advances in AI for Time Series Classification (TSC)\nto predict the end of the warm-up phase during test execution. We conduct\nexperiments by training three different TSC models on half a million of\nmeasurement segments obtained from JMH microbenchmark executions. We find that\nour framework significantly improves the accuracy of the warm-up estimates\nprovided by state-of-practice and state-of-the-art methods. This higher\nestimation accuracy results in a net improvement in either result quality or\ntesting time for up to +35.3% of the microbenchmarks. 
Our study highlights that\nintegrating AI to dynamically estimate the end of the warm-up phase can enhance\nthe cost-effectiveness of Java performance testing.\n","authors":["Luca Traini","Federico Di Menna","Vittorio Cortellessa"],"pdf_url":"https://arxiv.org/pdf/2408.05100v2.pdf","comment":"Accepted for publication in The 39th IEEE/ACM International\n Conference on Automated Software Engineering (ASE '24)"},{"id":"http://arxiv.org/abs/2405.00790v2","updated":"2024-09-14T18:32:20Z","published":"2024-05-01T18:02:25Z","title":"SCAR: Scheduling Multi-Model AI Workloads on Heterogeneous Multi-Chiplet\n Module Accelerators","summary":" Emerging multi-model workloads with heavy models like recent large language\nmodels significantly increased the compute and memory demands on hardware. To\naddress such increasing demands, designing a scalable hardware architecture\nbecame a key problem. Among recent solutions, the 2.5D silicon interposer\nmulti-chip module (MCM)-based AI accelerator has been actively explored as a\npromising scalable solution due to their significant benefits in the low\nengineering cost and composability. However, previous MCM accelerators are\nbased on homogeneous architectures with fixed dataflow, which encounter major\nchallenges from highly heterogeneous multi-model workloads due to their limited\nworkload adaptivity. Therefore, in this work, we explore the opportunity in the\nheterogeneous dataflow MCM AI accelerators. We identify the scheduling of\nmulti-model workload on heterogeneous dataflow MCM AI accelerator is an\nimportant and challenging problem due to its significance and scale, which\nreaches O(10^56) even for a two-model workload on 6x6 chiplets. We develop a\nset of heuristics to navigate the huge scheduling space and codify them into a\nscheduler, SCAR, with advanced techniques such as inter-chiplet pipelining. 
Our\nevaluation on ten multi-model workload scenarios for datacenter multitenancy\nand AR/VR use-cases has shown the efficacy of our approach, achieving on\naverage 27.6% and 29.6% less energy-delay product (EDP) for the respective\napplications settings compared to homogeneous baselines.\n","authors":["Mohanad Odema","Luke Chen","Hyoukjun Kwon","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2405.00790v2.pdf","comment":"MICRO'24"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2405.00790v2","updated":"2024-09-14T18:32:20Z","published":"2024-05-01T18:02:25Z","title":"SCAR: Scheduling Multi-Model AI Workloads on Heterogeneous Multi-Chiplet\n Module Accelerators","summary":" Emerging multi-model workloads with heavy models like recent large language\nmodels significantly increased the compute and memory demands on hardware. To\naddress such increasing demands, designing a scalable hardware architecture\nbecame a key problem. Among recent solutions, the 2.5D silicon interposer\nmulti-chip module (MCM)-based AI accelerator has been actively explored as a\npromising scalable solution due to their significant benefits in the low\nengineering cost and composability. However, previous MCM accelerators are\nbased on homogeneous architectures with fixed dataflow, which encounter major\nchallenges from highly heterogeneous multi-model workloads due to their limited\nworkload adaptivity. Therefore, in this work, we explore the opportunity in the\nheterogeneous dataflow MCM AI accelerators. We identify the scheduling of\nmulti-model workload on heterogeneous dataflow MCM AI accelerator is an\nimportant and challenging problem due to its significance and scale, which\nreaches O(10^56) even for a two-model workload on 6x6 chiplets. We develop a\nset of heuristics to navigate the huge scheduling space and codify them into a\nscheduler, SCAR, with advanced techniques such as inter-chiplet pipelining. 
Our\nevaluation on ten multi-model workload scenarios for datacenter multitenancy\nand AR/VR use-cases has shown the efficacy of our approach, achieving on\naverage 27.6% and 29.6% less energy-delay product (EDP) for the respective\napplications settings compared to homogeneous baselines.\n","authors":["Mohanad Odema","Luke Chen","Hyoukjun Kwon","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2405.00790v2.pdf","comment":"MICRO'24"}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2405.00790v2","updated":"2024-09-14T18:32:20Z","published":"2024-05-01T18:02:25Z","title":"SCAR: Scheduling Multi-Model AI Workloads on Heterogeneous Multi-Chiplet\n Module Accelerators","summary":" Emerging multi-model workloads with heavy models like recent large language\nmodels significantly increased the compute and memory demands on hardware. To\naddress such increasing demands, designing a scalable hardware architecture\nbecame a key problem. Among recent solutions, the 2.5D silicon interposer\nmulti-chip module (MCM)-based AI accelerator has been actively explored as a\npromising scalable solution due to their significant benefits in the low\nengineering cost and composability. However, previous MCM accelerators are\nbased on homogeneous architectures with fixed dataflow, which encounter major\nchallenges from highly heterogeneous multi-model workloads due to their limited\nworkload adaptivity. Therefore, in this work, we explore the opportunity in the\nheterogeneous dataflow MCM AI accelerators. We identify the scheduling of\nmulti-model workload on heterogeneous dataflow MCM AI accelerator is an\nimportant and challenging problem due to its significance and scale, which\nreaches O(10^56) even for a two-model workload on 6x6 chiplets. We develop a\nset of heuristics to navigate the huge scheduling space and codify them into a\nscheduler, SCAR, with advanced techniques such as inter-chiplet pipelining. 
Our\nevaluation on ten multi-model workload scenarios for datacenter multitenancy\nand AR/VR use-cases has shown the efficacy of our approach, achieving on\naverage 27.6% and 29.6% less energy-delay product (EDP) for the respective\napplications settings compared to homogeneous baselines.\n","authors":["Mohanad Odema","Luke Chen","Hyoukjun Kwon","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2405.00790v2.pdf","comment":"MICRO'24"},{"id":"http://arxiv.org/abs/2409.09414v1","updated":"2024-09-14T11:06:07Z","published":"2024-09-14T11:06:07Z","title":"Weather Prediction Using CNN-LSTM for Time Series Analysis: A Case Study\n on Delhi Temperature Data","summary":" As global climate change intensifies, accurate weather forecasting is\nincreasingly crucial for sectors such as agriculture, energy management, and\nenvironmental protection. Traditional methods, which rely on physical and\nstatistical models, often struggle with complex, nonlinear, and time-varying\ndata, underscoring the need for more advanced techniques. This study explores a\nhybrid CNN-LSTM model to enhance temperature forecasting accuracy for the Delhi\nregion, using historical meteorological data from 1996 to 2017. We employed\nboth direct and indirect methods, including comprehensive data preprocessing\nand exploratory analysis, to construct and train our model. The CNN component\neffectively extracts spatial features, while the LSTM captures temporal\ndependencies, leading to improved prediction accuracy. Experimental results\nindicate that the CNN-LSTM model significantly outperforms traditional\nforecasting methods in terms of both accuracy and stability, with a mean square\nerror (MSE) of 3.26217 and a root mean square error (RMSE) of 1.80615. The\nhybrid model demonstrates its potential as a robust tool for temperature\nprediction, offering valuable insights for meteorological forecasting and\nrelated fields. 
Future research should focus on optimizing model architecture,\nexploring additional feature extraction techniques, and addressing challenges\nsuch as overfitting and computational complexity. This approach not only\nadvances temperature forecasting but also provides a foundation for applying\ndeep learning to other time series forecasting tasks.\n","authors":["Bangyu Li","Yang Qian"],"pdf_url":"https://arxiv.org/pdf/2409.09414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09296v1","updated":"2024-09-14T04:13:46Z","published":"2024-09-14T04:13:46Z","title":"Developing an Interactive OpenMP Programming Book with Large Language\n Models","summary":" This paper presents an approach to authoring a textbook titled Interactive\nOpenMP Programming with the assistance of Large Language Models (LLMs). The\nwriting process utilized state-of-the-art LLMs, including Gemini Pro 1.5,\nClaude 3, and ChatGPT-4, to generate the initial structure and outline of the\nbook, as well as the initial content for specific chapters. This content\nincluded detailed descriptions of individual OpenMP constructs and practical\nprogramming examples. The outline and content have then undergone extensive\nmanual revisions to meet our book goals. In this paper, we report our findings\nabout the capabilities and limitations of these LLMs. We address critical\nquestions concerning the necessity of textbook resources and the effectiveness\nof LLMs in creating fundamental and practical programming content. Our findings\nsuggest that while LLMs offer significant advantages in generating textbook\ncontent, they require careful integration with traditional educational\nmethodologies to ensure depth, accuracy, and pedagogical effectiveness. 
The\nInteractive OpenMP Programming book is developed with the framework of Jupyter\nBook, enabling the execution of code within the book from the web browser,\nproviding instant feedback and a dynamic learning experience that stands in\ncontrast to traditional educational resources. The book represents a\nsignificant step towards modernizing programming education, offering insights\ninto practical strategies for generating the textbook through advanced AI\ntools.\n","authors":["Xinyao Yi","Anjia Wang","Yonghong Yan","Chunhua Liao"],"pdf_url":"https://arxiv.org/pdf/2409.09296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09273v1","updated":"2024-09-14T02:54:31Z","published":"2024-09-14T02:54:31Z","title":"Leveraging Foundation Models for Efficient Federated Learning in\n Resource-restricted Edge Networks","summary":" Recently pre-trained Foundation Models (FMs) have been combined with\nFederated Learning (FL) to improve training of downstream tasks while\npreserving privacy. However, deploying FMs over edge networks with\nresource-constrained Internet of Things (IoT) devices is under-explored. This\npaper proposes a novel framework, namely, Federated Distilling knowledge to\nPrompt (FedD2P), for leveraging the robust representation abilities of a\nvision-language FM without deploying it locally on edge devices. This framework\ndistills the aggregated knowledge of IoT devices to a prompt generator to\nefficiently adapt the frozen FM for downstream tasks. To eliminate the\ndependency on a public dataset, our framework leverages perclass local\nknowledge from IoT devices and linguistic descriptions of classes to train the\nprompt generator. Our experiments on diverse image classification datasets\nCIFAR, OxfordPets, SVHN, EuroSAT, and DTD show that FedD2P outperforms the\nbaselines in terms of model performance.\n","authors":["S. Kawa Atapour","S. Jamal SeyedMohammadi","S. Mohammad Sheikholeslami","Jamshid Abouei","Konstantinos N. 
Plataniotis","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2409.09273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09242v1","updated":"2024-09-14T00:46:51Z","published":"2024-09-14T00:46:51Z","title":"A Dynamic Weighting Strategy to Mitigate Worker Node Failure in\n Distributed Deep Learning","summary":" The increasing complexity of deep learning models and the demand for\nprocessing vast amounts of data make the utilization of large-scale distributed\nsystems for efficient training essential. These systems, however, face\nsignificant challenges such as communication overhead, hardware limitations,\nand node failure. This paper investigates various optimization techniques in\ndistributed deep learning, including Elastic Averaging SGD (EASGD) and the\nsecond-order method AdaHessian. We propose a dynamic weighting strategy to\nmitigate the problem of straggler nodes due to failure, enhancing the\nperformance and efficiency of the overall training process. We conduct\nexperiments with different numbers of workers and communication periods to\ndemonstrate improved convergence rates and test performance using our strategy.\n","authors":["Yuesheng Xu","Arielle Carr"],"pdf_url":"https://arxiv.org/pdf/2409.09242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1706.02149v3","updated":"2024-09-14T01:26:45Z","published":"2017-06-06T05:48:11Z","title":"Experiments of posture estimation on vehicles using wearable\n acceleration sensors","summary":" In this paper, we study methods to estimate drivers' posture in vehicles\nusing acceleration data of wearable sensor and conduct a field test. Recently,\nsensor technologies have been progressed. Solutions of safety management to\nanalyze vital data acquired from wearable sensor and judge work status are\nproposed. To prevent huge accidents, demands for safety management of bus and\ntaxi are high. 
However, acceleration of vehicles is added to wearable sensor in\nvehicles, and there is no guarantee to estimate drivers' posture accurately.\nTherefore, in this paper, we study methods to estimate driving posture using\nacceleration data acquired from T-shirt type wearable sensor hitoe, conduct\nfield tests and implement a sample application.\n Y. Yamato, \"Experiments of Posture Estimation on Vehicles Using Wearable\nAcceleration Sensors,\" The 3rd IEEE International Conference on Big Data\nSecurity on Cloud (BigDataSecurity 2017), pp.14-17, DOI:\n10.1109/BigDataSecurity.2017.8, May 2017.\n \"(c) 2017 IEEE. Personal use of this material is permitted. Permission from\nIEEE must be obtained for all other uses, in any current or future media,\nincluding reprinting/republishing this material for advertising or promotional\npurposes, creating new collective works, for resale or redistribution to\nservers or lists, or reuse of any copyrighted component of this work in other\nworks.\"\n","authors":["Yoji Yamato"],"pdf_url":"https://arxiv.org/pdf/1706.02149v3.pdf","comment":"4 pages, 4 figures, The 3rd IEEE International Conference on Big Data\n Security on Cloud (BigDataSecurity 2017), pp.14-17, Beijing, May 2017"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2409.09271v1","updated":"2024-09-14T02:43:20Z","published":"2024-09-14T02:43:20Z","title":"Python Symbolic Execution with LLM-powered Code Generation","summary":" Symbolic execution is a key technology in software testing, which generates\ntest cases by collecting symbolic path constraints and then solving constraints\nwith SMT solvers. 
Symbolic execution has been proven helpful in generating\nhigh-coverage test cases, but its limitations, e.g., the difficulties in\nsolving path constraints, prevent it from broader usage in software testing.\nMoreover, symbolic execution has encountered many difficulties when applied to\ndynamically typed languages like Python, because it is extremely challenging to\ntranslate the flexible Python grammar into rigid solvers.\n To overcome the main challenges of applying symbolic execution in Python, we\nproposed an LLM-empowered agent, LLM-Sym, that automatically calls an SMT\nsolver, Z3, to solve execution path constraints. Based on an introductory-level\nsymbolic execution engine, our LLM agent can extend it to supporting programs\nwith complex data type `list'. The core contribution of LLM-Sym is translating\ncomplex Python path constraints into Z3 code. To enable accurate path-to-Z3\ntranslation, we design a multiple-step code generation pipeline including type\ninference, retrieval and self-refine. Our experiments demonstrate that LLM-Sym\nis capable of solving path constraints on Leetcode problems with complicated\ncontrol flows and list data structures, which is impossible for the backbone\nsymbolic execution engine. Our approach paves the way for the combination of\nthe generation ability of LLMs with the reasoning ability of symbolic solvers,\nand opens up new opportunities in LLM-augmented test case generation.\n","authors":["Wenhan Wang","Kaibo Liu","An Ran Chen","Ge Li","Zhi Jin","Gang Huang","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2409.09271v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2406.12322v3","updated":"2024-09-14T14:10:07Z","published":"2024-06-18T06:44:41Z","title":"Orbit-blocking words and the average-case complexity of Whitehead's\n problem in the free group of rank 2","summary":" Let F_2 denote the free group of rank 2. 
Our main technical result of\nindependent interest is: for any element u of F_2, there is g in F_2 such that\nno cyclically reduced image of u under an automorphism of F_2 contains g as a\nsubword. We then address computational complexity of the following version of\nthe Whitehead automorphism problem: given a fixed u in F_2, decide, on an input\nv in F_2 of length n, whether or not v is an automorphic image of u. We show\nthat there is an algorithm that solves this problem and has constant (i.e.,\nindependent of n) average-case complexity.\n","authors":["Lucy Hyde","Siobhan O'Connor","Vladimir Shpilrain"],"pdf_url":"https://arxiv.org/pdf/2406.12322v3.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2211.01990v3","updated":"2024-09-14T03:10:49Z","published":"2022-11-03T16:56:47Z","title":"Hive-type polytopes for quiver multiplicities and the membership problem\n for quiver moment cones","summary":" Let $Q$ be a bipartite quiver with vertex set $Q_0$ such that the number of\narrows between any source vertex and any sink vertex is constant. Let\n$\\beta=(\\beta(x))_{x \\in Q_0}$ be a dimension vector of $Q$ with positive\ninteger coordinates.\n Let $rep(Q, \\beta)$ be the representation space of $\\beta$-dimensional\nrepresentations of $Q$ and $GL(\\beta)$ the base change group acting on $rep(Q,\n\\beta)$ be simultaneous conjugation. Let $K^{\\beta}_{\\underline{\\lambda}}$ be\nthe multiplicity of the irreducible representation of $GL(\\beta)$ of highest\nweight $\\underline{\\lambda}$ in the ring of polynomial functions on $rep(Q,\n\\beta)$.\n We show that $K^{\\beta}_{\\underline{\\lambda}}$ can be expressed as the number\nof lattice points of a polytope obtained by gluing together two Knutson-Tao\nhive polytopes. 
Furthermore, this polytopal description together with\nDerksen-Weyman's Saturation Theorem for quiver semi-invariants allows us to use\nTardos' algorithm to solve the membership problem for the moment cone\nassociated to $(Q,\\beta)$ in strongly polynomial time.\n","authors":["Calin Chindris","Brett Collins","Daniel Kline"],"pdf_url":"https://arxiv.org/pdf/2211.01990v3.pdf","comment":"v2: Fixed the claim about the generic quiver semi-stability problem\n (see Remarks 2.8 and 5.5); v3: Final version to appear in Algebraic\n Combinatorics. The focus is on polytopal descriptions of multiplicities of\n irreducible representations of $GL(\\beta)$ in the ring of polynomial\n functions on $rep(Q, \\beta)$"},{"id":"http://arxiv.org/abs/2406.12322v3","updated":"2024-09-14T14:10:07Z","published":"2024-06-18T06:44:41Z","title":"Orbit-blocking words and the average-case complexity of Whitehead's\n problem in the free group of rank 2","summary":" Let F_2 denote the free group of rank 2. Our main technical result of\nindependent interest is: for any element u of F_2, there is g in F_2 such that\nno cyclically reduced image of u under an automorphism of F_2 contains g as a\nsubword. We then address computational complexity of the following version of\nthe Whitehead automorphism problem: given a fixed u in F_2, decide, on an input\nv in F_2 of length n, whether or not v is an automorphic image of u. We show\nthat there is an algorithm that solves this problem and has constant (i.e.,\nindependent of n) average-case complexity.\n","authors":["Lucy Hyde","Siobhan O'Connor","Vladimir Shpilrain"],"pdf_url":"https://arxiv.org/pdf/2406.12322v3.pdf","comment":"6 pages. 
arXiv admin note: text overlap with arXiv:2401.09218"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2310.17295v2","updated":"2024-09-14T18:42:20Z","published":"2023-10-26T10:23:02Z","title":"Normal Forms for Elements of ${}^*$-Continuous Kleene Algebras\n Representing the Context-Free Languages","summary":" Within the tensor product $K \\mathop{\\otimes_{\\cal R}} C_2'$ of any\n${}^*$-continuous Kleene algebra $K$ with the polycyclic ${}^*$-continuous\nKleene algebra $C_2'$ over two bracket pairs there is a copy of the fixed-point\nclosure of $K$: the centralizer of $C_2'$ in $K \\mathop{\\otimes_{\\cal R}}\nC_2'$. Using an automata-theoretic representation of elements of\n$K\\mathop{\\otimes_{\\cal R}} C_2'$ \\`a la Kleene, with the aid of normal form\ntheorems that restrict the occurrences of brackets on paths through the\nautomata, we develop a foundation for a calculus of context-free expressions\nwithout variable binders. We also give some results on the bra-ket\n${}^*$-continuous Kleene algebra $C_2$, motivate the ``completeness equation''\nthat distinguishes $C_2$ from $C_2'$, and show that $C_2'$ already validates a\nrelativized form of this equation.\n","authors":["Mark Hopkins","Hans Leiß"],"pdf_url":"https://arxiv.org/pdf/2310.17295v2.pdf","comment":"Revised version. 
43 pages, 4 figures"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2409.09485v1","updated":"2024-09-14T17:15:30Z","published":"2024-09-14T17:15:30Z","title":"Enumerating Minimal Unsatisfiable Cores of LTLf formulas","summary":" Linear Temporal Logic over finite traces ($\\text{LTL}_f$) is a widely used\nformalism with applications in AI, process mining, model checking, and more.\nThe primary reasoning task for $\\text{LTL}_f$ is satisfiability checking; yet,\nthe recent focus on explainable AI has increased interest in analyzing\ninconsistent formulas, making the enumeration of minimal explanations for\ninfeasibility a relevant task also for $\\text{LTL}_f$. This paper introduces a\nnovel technique for enumerating minimal unsatisfiable cores (MUCs) of an\n$\\text{LTL}_f$ specification. The main idea is to encode a $\\text{LTL}_f$\nformula into an Answer Set Programming (ASP) specification, such that the\nminimal unsatisfiable subsets (MUSes) of the ASP program directly correspond to\nthe MUCs of the original $\\text{LTL}_f$ specification. Leveraging recent\nadvancements in ASP solving yields a MUC enumerator achieving good performance\nin experiments conducted on established benchmarks from the literature.\n","authors":["Antonio Ielo","Giuseppe Mazzotta","Rafael Peñaloza","Francesco Ricca"],"pdf_url":"https://arxiv.org/pdf/2409.09485v1.pdf","comment":null}]},"2024-09-16T00:00:00Z":{"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2409.01881v2","updated":"2024-09-16T10:07:30Z","published":"2024-09-03T13:22:38Z","title":"The Impact of Run-Time Variability on Side-Channel Attacks Targeting\n FPGAs","summary":" To defeat side-channel attacks, many recent countermeasures work by enforcing\nrandom run-time variability to the target computing platform in terms of clock\njitters, frequency and voltage scaling, and phase shift, also combining the\ncontributions from different actuators to maximize the side-channel resistance\nof the target. 
However, the robustness of such solutions seems strongly\ninfluenced by several hyper-parameters for which an in-depth analysis is still\nmissing. This work proposes a fine-grained dynamic voltage and frequency\nscaling actuator to investigate the effectiveness of recent desynchronization\ncountermeasures with the goal of highlighting the link between the enforced\nrun-time variability and the vulnerability to side-channel attacks of\ncryptographic implementations targeting FPGAs. The analysis of the results\ncollected from real hardware allowed for a comprehensive understanding of the\nprotection offered by run-time variability countermeasures against side-channel\nattacks.\n","authors":["Davide Galli","Adriano Guarisco","William Fornaciari","Matteo Matteucci","Davide Zoni"],"pdf_url":"https://arxiv.org/pdf/2409.01881v2.pdf","comment":"Accepted for lecture presentation at 2024 31st IEEE International\n Conference on Electronics, Circuits and Systems (ICECS), Nancy, France, Nov.\n 18-20, 2024"},{"id":"http://arxiv.org/abs/2310.01336v2","updated":"2024-09-16T16:05:55Z","published":"2023-10-02T16:53:00Z","title":"JugglePAC: a Pipelined Accumulation Circuit","summary":" Reducing a set of numbers to a single value is a fundamental operation in\napplications such as signal processing, data compression, scientific computing,\nand neural networks. Accumulation, which involves summing a dataset to obtain a\nsingle result, is crucial for these tasks. Due to hardware constraints, large\nvectors or matrices often cannot be fully stored in memory and must be read\nsequentially, one item per clock cycle. For high-speed inputs, such as rapidly\narriving floating-point numbers, pipelined adders are necessary to maintain\nperformance. However, pipelining introduces multiple intermediate sums and\nrequires delays between back-to-back datasets unless their processing is\noverlapped. 
In this paper, we present JugglePAC, a novel accumulation circuit\ndesigned to address these challenges. JugglePAC operates quickly, is\narea-efficient, and features a fully pipelined design. It effectively manages\nback-to-back variable-length datasets while consistently producing results in\nthe correct input order. Compared to the state-of-the-art, JugglePAC achieves\nhigher throughput and reduces area complexity, offering significant\nimprovements in performance and efficiency.\n","authors":["Ahmad Houraniah","H. Fatih Ugurdag","Furkan Aydin"],"pdf_url":"https://arxiv.org/pdf/2310.01336v2.pdf","comment":"4 pages, 1 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.10325v1","updated":"2024-09-16T14:36:22Z","published":"2024-09-16T14:36:22Z","title":"PASS: An Asynchronous Probabilistic Processor for Next Generation\n Intelligence","summary":" New computing paradigms are required to solve the most challenging\ncomputational problems where no exact polynomial time solution\nexists.Probabilistic Ising Accelerators has gained promise on these problems\nwith the ability to model complex probability distributions and find ground\nstates of intractable problems. In this context, we have demonstrated the\nParallel Asynchronous Stochastic Sampler (PASS), the first fully on-chip\nintegrated, asynchronous, probabilistic accelerator that takes advantage of the\nintrinsic fine-grained parallelism of the Ising Model and built in state of the\nart 14nm CMOS FinFET technology. 
We have demonstrated broad applicability of\nthis accelerator on problems ranging from Combinatorial Optimization, Neural\nSimulation, to Machine Learning along with up to $23,000$x energy to solution\nimprovement compared to CPUs on probabilistic problems.\n","authors":["Saavan Patel","Philip Canoza","Adhiraj Datar","Steven Lu","Chirag Garg","Sayeef Salahuddin"],"pdf_url":"https://arxiv.org/pdf/2409.10325v1.pdf","comment":"13 page main text, 5 main figures, 21 pages supplementary and\n methods, 7 supplementary figures, 2 supplementary tables"},{"id":"http://arxiv.org/abs/2409.10136v1","updated":"2024-09-16T10:03:10Z","published":"2024-09-16T10:03:10Z","title":"Count2Multiply: Reliable In-memory High-Radix Counting","summary":" Big data processing has exposed the limits of compute-centric hardware\nacceleration due to the memory-to-processor bandwidth bottleneck. Consequently,\nthere has been a shift towards memory-centric architectures, leveraging\nsubstantial compute parallelism by processing using the memory elements\ndirectly. Computing-in-memory (CIM) proposals for both conventional and\nemerging memory technologies often target massively parallel operations.\nHowever, current CIM solutions face significant challenges. For emerging\ndata-intensive applications, such as advanced machine learning techniques and\nbioinformatics, where matrix multiplication is a key primitive, memristor\ncrossbars suffer from limited write endurance and expensive write operations.\nIn contrast, while DRAM-based solutions have successfully demonstrated\nmultiplication using additions, they remain prohibitively slow. This paper\nintroduces Count2Multiply, a technology-agnostic digital-CIM method for\nperforming integer-binary and integer-integer matrix multiplications using\nhigh-radix, massively parallel counting implemented with bitwise logic\noperations. 
In addition, Count2Multiply is designed with fault tolerance in\nmind and leverages traditional scalable row-wise error correction codes, such\nas Hamming and BCH codes, to protect against the high error rates of existing\nCIM designs. We demonstrate Count2Multiply with a detailed application to CIM\nin conventional DRAM due to its ubiquity and high endurance. We also explore\nthe acceleration potential of racetrack memories due to their shifting\nproperties, which are natural for Count2Multiply, and their high endurance.\nCompared to the state-of-the-art in-DRAM method, Count2Multiply achieves up to\n10x speedup, 3.8x higher GOPS/Watt, and 1.4x higher GOPS/area, while the RTM\ncounterpart offers gains of 10x, 57x, and 3.8x.\n","authors":["João Paulo Cardoso de Lima","Benjamin Franklin Morris III","Asif Ali Khan","Jeronimo Castrillon","Alex K. Jones"],"pdf_url":"https://arxiv.org/pdf/2409.10136v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2409.09948v1","updated":"2024-09-16T02:40:02Z","published":"2024-09-16T02:40:02Z","title":"Enhancing Industrial Cybersecurity: SoftHSM Implementation on SBCs for\n Mitigating MITM Attacks","summary":" The rapid growth of industrial technology, driven by automation, IoT, and\ncloud computing, has also increased the risk of cyberattacks, such as\nMan-in-the-Middle (MITM) attacks. A standard solution to protect data is using\na Hardware Security Module (HSM), but its high implementation cost has led to\nthe development of a more affordable alternative: SoftHSM. This software-based\nmodule manages encryption and decryption keys using cryptographic algorithms.\nThis study simulates the use of SoftHSM on a single-board computer (SBC) to\nenhance industrial system security and cost-effectively mitigate MITM attacks.\nThe security system integrates AES and RSA cryptographic algorithms, with\nSoftHSM handling RSA key storage. The results show that HSM protects RSA\nprivate keys from extraction attempts, ensuring data security. 
In terms of\nperformance, the system achieved an average encryption time of 3.29 seconds, a\nslot access time of 0.018 seconds, and a decryption time of 2.558 seconds. It\nalso demonstrated efficient memory usage, with 37.24% for encryption and 24.24%\nfor decryption, while consuming 5.20 V and 0.72 A during processing.\n","authors":["Joshua Tito Amael","Jazi Eko Istiyanto","Oskar Natan"],"pdf_url":"https://arxiv.org/pdf/2409.09948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09928v1","updated":"2024-09-16T02:06:49Z","published":"2024-09-16T02:06:49Z","title":"High-Security Hardware Module with PUF and Hybrid Cryptography for Data\n Security","summary":" This research highlights the rapid development of technology in the industry,\nparticularly Industry 4.0, supported by fundamental technologies such as the\nInternet of Things (IoT), cloud computing, big data, and data analysis. Despite\nproviding efficiency, these developments also bring negative impacts, such as\nincreased cyber-attacks, especially in manufacturing. One standard attack in\nthe industry is the man-in-the-middle (MITM) attack, which can have severe\nconsequences for the physical data transfer, particularly on the integrity of\nsensor and actuator data in industrial machines. This research proposes a\nsolution by developing a hardware security module (HSM) using a\nfield-programmable gate array (FPGA) with physical unclonable function (PUF)\nauthentication and a hybrid encryption data security system. 
Experimental\nresults show that this research improves some criteria in industrial\ncybersecurity, ensuring critical data security from cyber-attacks in industrial\nmachines.\n","authors":["Joshua Tito Amael","Oskar Natan","Jazi Eko Istiyanto"],"pdf_url":"https://arxiv.org/pdf/2409.09928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11440v1","updated":"2024-09-16T15:18:33Z","published":"2024-09-16T15:18:33Z","title":"MARCA: Mamba Accelerator with ReConfigurable Architecture","summary":" We propose a Mamba accelerator with reconfigurable architecture, MARCA.We\npropose three novel approaches in this paper. (1) Reduction alternative PE\narray architecture for both linear and element-wise operations. For linear\noperations, the reduction tree connected to PE arrays is enabled and executes\nthe reduction operation. For element-wise operations, the reduction tree is\ndisabled and the output bypasses. (2) Reusable nonlinear function unit based on\nthe reconfigurable PE. We decompose the exponential function into element-wise\noperations and a shift operation by a fast biased exponential algorithm, and\nthe activation function (SiLU) into a range detection and element-wise\noperations by a piecewise approximation algorithm. Thus, the reconfigurable PEs\nare reused to execute nonlinear functions with negligible accuracy loss.(3)\nIntra-operation and inter-operation buffer management strategy. We propose\nintra-operation buffer management strategy to maximize input data sharing for\nlinear operations within operations, and inter-operation strategy for\nelement-wise operations between operations. 
We conduct extensive experiments on\nMamba model families with different sizes.MARCA achieves up to\n463.22$\\times$/11.66$\\times$ speedup and up to 9761.42$\\times$/242.52$\\times$\nenergy efficiency compared to Intel Xeon 8358P CPU and NVIDIA Tesla A100 GPU\nimplementations, respectively.\n","authors":["Jinhao Li","Shan Huang","Jiaming Xu","Jun Liu","Li Ding","Ningyi Xu","Guohao Dai"],"pdf_url":"https://arxiv.org/pdf/2409.11440v1.pdf","comment":"9 pages, 10 figures, accepted by ICCAD 2024. arXiv admin note: text\n overlap with arXiv:2001.02514 by other authors"}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2409.07734v2","updated":"2024-09-16T08:18:59Z","published":"2024-09-12T03:44:30Z","title":"DFDG: Data-Free Dual-Generator Adversarial Distillation for One-Shot\n Federated Learning","summary":" Federated Learning (FL) is a distributed machine learning scheme in which\nclients jointly participate in the collaborative training of a global model by\nsharing model information rather than their private datasets. In light of\nconcerns associated with communication and privacy, one-shot FL with a single\ncommunication round has emerged as a de facto promising solution. However,\nexisting one-shot FL methods either require public datasets, focus on model\nhomogeneous settings, or distill limited knowledge from local models, making it\ndifficult or even impractical to train a robust global model. To address these\nlimitations, we propose a new data-free dual-generator adversarial distillation\nmethod (namely DFDG) for one-shot FL, which can explore a broader local models'\ntraining space via training dual generators. DFDG is executed in an adversarial\nmanner and comprises two parts: dual-generator training and dual-model\ndistillation. 
In dual-generator training, we delve into each generator\nconcerning fidelity, transferability and diversity to ensure its utility, and\nadditionally tailor the cross-divergence loss to lessen the overlap of dual\ngenerators' output spaces. In dual-model distillation, the trained dual\ngenerators work together to provide the training data for updates of the global\nmodel. At last, our extensive experiments on various image classification tasks\nshow that DFDG achieves significant performance gains in accuracy compared to\nSOTA baselines.\n","authors":["Kangyang Luo","Shuai Wang","Yexuan Fu","Renrong Shao","Xiang Li","Yunshi Lan","Ming Gao","Jinlong Shu"],"pdf_url":"https://arxiv.org/pdf/2409.07734v2.pdf","comment":"Accepted by ICDM2024 main conference (long paper)"},{"id":"http://arxiv.org/abs/2409.06955v2","updated":"2024-09-16T08:23:09Z","published":"2024-09-11T02:36:36Z","title":"Privacy-Preserving Federated Learning with Consistency via Knowledge\n Distillation Using Conditional Generator","summary":" Federated Learning (FL) is gaining popularity as a distributed learning\nframework that only shares model parameters or gradient updates and keeps\nprivate data locally. However, FL is at risk of privacy leakage caused by\nprivacy inference attacks. And most existing privacy-preserving mechanisms in\nFL conflict with achieving high performance and efficiency. Therefore, we\npropose FedMD-CG, a novel FL method with highly competitive performance and\nhigh-level privacy preservation, which decouples each client's local model into\na feature extractor and a classifier, and utilizes a conditional generator\ninstead of the feature extractor to perform server-side model aggregation. To\nensure the consistency of local generators and classifiers, FedMD-CG leverages\nknowledge distillation to train local models and generators at both the latent\nfeature level and the logit level. 
Also, we construct additional classification\nlosses and design new diversity losses to enhance client-side training.\nFedMD-CG is robust to data heterogeneity and does not require training extra\ndiscriminators (like cGAN). We conduct extensive experiments on various image\nclassification tasks to validate the superiority of FedMD-CG.\n","authors":["Kangyang Luo","Shuai Wang","Xiang Li","Yunshi Lan","Ming Gao","Jinlong Shu"],"pdf_url":"https://arxiv.org/pdf/2409.06955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00822v2","updated":"2024-09-16T16:24:05Z","published":"2024-09-01T19:43:40Z","title":"RTop-K: Ultra-Fast Row-Wise Top-K Algorithm and GPU Implementation for\n Neural Networks","summary":" Top-k algorithms are essential in various applications, from high-performance\ncomputing and information retrieval to big data and neural network model\ntraining. This paper introduces RTop-K, a highly efficient parallel row-wise\ntop-k selection algorithm designed for GPUs. RTop-K employs a Binary\nSearch-based approach to optimize resource allocation and provides a scalable\nsolution that significantly accelerates top-k operations. We perform a\ntheoretical analysis of the effects of early stopping in our algorithm,\ndemonstrating that it maintains the accuracy of neural network models while\nenhancing performance. Comprehensive tests show that our GPU implementation of\nRTop-K outperforms other row-wise top-k GPU implementations, with minimal\nimpact on testing accuracy when early stopping is applied. Notably, RTop-K\nachieves speed increases ranging from 4.245$\\times$ to 9.506$\\times$ with early\nstopping, and 3.936$\\times$ without early stopping, compared to\nstate-of-the-art implementations. 
The proposed methods offer significant\nimprovements in the training and inference of Graph Neural Networks (GNNs),\naddressing critical challenges in latency and throughput on GPU platforms.\n","authors":["Xi Xie","Yuebo Luo","Hongwu Peng","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2409.00822v2.pdf","comment":"Need to improve the experiment part"},{"id":"http://arxiv.org/abs/2206.09563v6","updated":"2024-09-16T16:39:48Z","published":"2022-06-20T04:17:32Z","title":"Scalable Distributed Algorithms for Size-Constrained Submodular\n Maximization in the MapReduce and Adaptive Complexity Models","summary":" Distributed maximization of a submodular function in the MapReduce (MR) model\nhas received much attention, culminating in two frameworks that allow a\ncentralized algorithm to be run in the MR setting without loss of\napproximation, as long as the centralized algorithm satisfies a certain\nconsistency property -- which had previously only been known to be satisfied by\nthe standard greedy and continous greedy algorithms. A separate line of work\nhas studied parallelizability of submodular maximization in the adaptive\ncomplexity model, where each thread may have access to the entire ground set.\nFor the size-constrained maximization of a monotone and submodular function, we\nshow that several sublinearly adaptive (highly parallelizable) algorithms\nsatisfy the consistency property required to work in the MR setting, which\nyields practical, parallelizable and distributed algorithms. Separately, we\ndevelop the first distributed algorithm with linear query complexity for this\nproblem. 
Finally, we provide a method to increase the maximum cardinality\nconstraint for MR algorithms at the cost of additional MR rounds.\n","authors":["Yixin Chen","Tonmoy Dey","Alan Kuhnle"],"pdf_url":"https://arxiv.org/pdf/2206.09563v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00846v3","updated":"2024-09-16T16:30:09Z","published":"2024-06-02T19:50:05Z","title":"Local Methods with Adaptivity via Scaling","summary":" The rapid development of machine learning and deep learning has introduced\nincreasingly complex optimization challenges that must be addressed. Indeed,\ntraining modern, advanced models has become difficult to implement without\nleveraging multiple computing nodes in a distributed environment. Distributed\noptimization is also fundamental to emerging fields such as federated learning.\nSpecifically, there is a need to organize the training process to minimize the\ntime lost due to communication. A widely used and extensively researched\ntechnique to mitigate the communication bottleneck involves performing local\ntraining before communication. This approach is the focus of our paper.\nConcurrently, adaptive methods that incorporate scaling, notably led by Adam,\nhave gained significant popularity in recent years. Therefore, this paper aims\nto merge the local training technique with the adaptive approach to develop\nefficient distributed learning methods. We consider the classical Local SGD\nmethod and enhance it with a scaling feature. A crucial aspect is that the\nscaling is described generically, allowing us to analyze various approaches,\nincluding Adam, RMSProp, and OASIS, in a unified manner. 
In addition to\ntheoretical analysis, we validate the performance of our methods in practice by\ntraining a neural network.\n","authors":["Savelii Chezhegov","Sergey Skorik","Nikolas Khachaturov","Danil Shalagin","Aram Avetisyan","Martin Takáč","Yaroslav Kholodov","Aleksandr Beznosikov"],"pdf_url":"https://arxiv.org/pdf/2406.00846v3.pdf","comment":"41 pages, 2 algorithms, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.10392v1","updated":"2024-09-16T15:27:35Z","published":"2024-09-16T15:27:35Z","title":"TPFL: Tsetlin-Personalized Federated Learning with Confidence-Based\n Clustering","summary":" The world of Machine Learning (ML) has witnessed rapid changes in terms of\nnew models and ways to process users data. The majority of work that has been\ndone is focused on Deep Learning (DL) based approaches. However, with the\nemergence of new algorithms such as the Tsetlin Machine (TM) algorithm, there\nis growing interest in exploring alternative approaches that may offer unique\nadvantages in certain domains or applications. One of these domains is\nFederated Learning (FL), in which users privacy is of utmost importance. Due to\nits novelty, FL has seen a surge in the incorporation of personalization\ntechniques to enhance model accuracy while maintaining user privacy under\npersonalized conditions. In this work, we propose a novel approach dubbed TPFL:\nTsetlin-Personalized Federated Learning, in which models are grouped into\nclusters based on their confidence towards a specific class. In this way,\nclustering can benefit from two key advantages. Firstly, clients share only\nwhat they are confident about, resulting in the elimination of wrongful weight\naggregation among clients whose data for a specific class may have not been\nenough during the training. This phenomenon is prevalent when the data are\nnon-Independent and Identically Distributed (non-IID). 
Secondly, by sharing\nonly weights towards a specific class, communication cost is substantially\nreduced, making TPLF efficient in terms of both accuracy and communication\ncost. The results of TPFL demonstrated the highest accuracy on three different\ndatasets; namely MNIST, FashionMNIST and FEMNIST.\n","authors":["Rasoul Jafari Gohari","Laya Aliahmadipour","Ezat Valipour"],"pdf_url":"https://arxiv.org/pdf/2409.10392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10325v1","updated":"2024-09-16T14:36:22Z","published":"2024-09-16T14:36:22Z","title":"PASS: An Asynchronous Probabilistic Processor for Next Generation\n Intelligence","summary":" New computing paradigms are required to solve the most challenging\ncomputational problems where no exact polynomial time solution\nexists.Probabilistic Ising Accelerators has gained promise on these problems\nwith the ability to model complex probability distributions and find ground\nstates of intractable problems. In this context, we have demonstrated the\nParallel Asynchronous Stochastic Sampler (PASS), the first fully on-chip\nintegrated, asynchronous, probabilistic accelerator that takes advantage of the\nintrinsic fine-grained parallelism of the Ising Model and built in state of the\nart 14nm CMOS FinFET technology. 
We have demonstrated broad applicability of\nthis accelerator on problems ranging from Combinatorial Optimization, Neural\nSimulation, to Machine Learning along with up to $23,000$x energy to solution\nimprovement compared to CPUs on probabilistic problems.\n","authors":["Saavan Patel","Philip Canoza","Adhiraj Datar","Steven Lu","Chirag Garg","Sayeef Salahuddin"],"pdf_url":"https://arxiv.org/pdf/2409.10325v1.pdf","comment":"13 page main text, 5 main figures, 21 pages supplementary and\n methods, 7 supplementary figures, 2 supplementary tables"},{"id":"http://arxiv.org/abs/2409.10235v1","updated":"2024-09-16T12:33:41Z","published":"2024-09-16T12:33:41Z","title":"Maintaining Distributed Data Structures in Dynamic Peer-to-Peer Networks","summary":" We study robust and efficient distributed algorithms for building and\nmaintaining distributed data structures in dynamic Peer-to-Peer (P2P) networks.\nP2P networks are characterized by a high level of dynamicity with abrupt heavy\nnode \\emph{churn} (nodes that join and leave the network continuously over\ntime). We present a novel algorithm that builds and maintains with high\nprobability a skip list for $poly(n)$ rounds despite $\\mathcal{O}(n/\\log n)$\nchurn \\emph{per round} ($n$ is the stable network size). We assume that the\nchurn is controlled by an oblivious adversary (that has complete knowledge and\ncontrol of what nodes join and leave and at what time and has unlimited\ncomputational power, but is oblivious to the random choices made by the\nalgorithm). Moreover, the maintenance overhead is proportional to the churn\nrate. 
Furthermore, the algorithm is scalable in that the messages are small\n(i.e., at most $polylog(n)$ bits) and every node sends and receives at most\n$polylog(n)$ messages per round.\n Our algorithm crucially relies on novel distributed and parallel algorithms\nto merge two $n$-elements skip lists and delete a large subset of items, both\nin $\\mathcal{O}(\\log n)$ rounds with high probability. These procedures may be\nof independent interest due to their elegance and potential applicability in\nother contexts in distributed data structures.\n To the best of our knowledge, our work provides the first-known\nfully-distributed data structure that provably works under highly dynamic\nsettings (i.e., high churn rate). Furthermore, they are localized (i.e., do not\nrequire any global topological knowledge). Finally, we believe that our\nframework can be generalized to other distributed and dynamic data structures\nincluding graphs, potentially leading to stable distributed computation despite\nheavy churn.\n","authors":["John Augustine","Antonio Cruciani","Iqra Altaf Gillani"],"pdf_url":"https://arxiv.org/pdf/2409.10235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10226v1","updated":"2024-09-16T12:21:04Z","published":"2024-09-16T12:21:04Z","title":"Privacy-Preserving Distributed Maximum Consensus Without Accuracy Loss","summary":" In distributed networks, calculating the maximum element is a fundamental\ntask in data analysis, known as the distributed maximum consensus problem.\nHowever, the sensitive nature of the data involved makes privacy protection\nessential. Despite its importance, privacy in distributed maximum consensus has\nreceived limited attention in the literature. Traditional privacy-preserving\nmethods typically add noise to updates, degrading the accuracy of the final\nresult. To overcome these limitations, we propose a novel distributed\noptimization-based approach that preserves privacy without sacrificing\naccuracy. 
Our method introduces virtual nodes to form an augmented graph and\nleverages a carefully designed initialization process to ensure the privacy of\nhonest participants, even when all their neighboring nodes are dishonest.\nThrough a comprehensive information-theoretical analysis, we derive a\nsufficient condition to protect private data against both passive and\neavesdropping adversaries. Extensive experiments validate the effectiveness of\nour approach, demonstrating that it not only preserves perfect privacy but also\nmaintains accuracy, outperforming existing noise-based methods that typically\nsuffer from accuracy loss.\n","authors":["Wenrui Yu","Richard Heusdens","Jun Pang","Qiongxiu Li"],"pdf_url":"https://arxiv.org/pdf/2409.10226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09934v1","updated":"2024-09-16T02:17:58Z","published":"2024-09-16T02:17:58Z","title":"Coordination-free Collaborative Replication based on Operational\n Transformation","summary":" We introduce Coordination-free Collaborative Replication (CCR), a new method\nfor maintaining consistency across replicas in distributed systems without\nrequiring explicit coordination messages. CCR automates conflict resolution,\ncontrasting with traditional Data-sharing systems that typically involve\ncentralized update management or predefined consistency rules.\n Operational Transformation (OT), commonly used in collaborative editing,\nensures consistency by transforming operations while maintaining document\nintegrity across replicas. However, OT assumes server-based coordination, which\nis unsuitable for modern, decentralized Peer-to-Peer (P2P) systems.\n Conflict-free Replicated Data Type (CRDT), like Two-Phase Sets (2P-Sets),\nguarantees eventual consistency by allowing commutative and associative\noperations but often result in counterintuitive behaviors, such as failing to\nre-add an item to a shopping cart once removed.\n In contrast, CCR employs a more intuitive approach to replication. 
It allows\nfor straightforward updates and conflict resolution based on the current data\nstate, enhancing clarity and usability compared to CRDTs. Furthermore, CCR\naddresses inefficiencies in messaging by developing a versatile protocol based\non data stream confluence, thus providing a more efficient and practical\nsolution for collaborative data sharing in distributed systems.\n","authors":["Masato Takeichi"],"pdf_url":"https://arxiv.org/pdf/2409.09934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07734v2","updated":"2024-09-16T08:18:59Z","published":"2024-09-12T03:44:30Z","title":"DFDG: Data-Free Dual-Generator Adversarial Distillation for One-Shot\n Federated Learning","summary":" Federated Learning (FL) is a distributed machine learning scheme in which\nclients jointly participate in the collaborative training of a global model by\nsharing model information rather than their private datasets. In light of\nconcerns associated with communication and privacy, one-shot FL with a single\ncommunication round has emerged as a de facto promising solution. However,\nexisting one-shot FL methods either require public datasets, focus on model\nhomogeneous settings, or distill limited knowledge from local models, making it\ndifficult or even impractical to train a robust global model. To address these\nlimitations, we propose a new data-free dual-generator adversarial distillation\nmethod (namely DFDG) for one-shot FL, which can explore a broader local models'\ntraining space via training dual generators. DFDG is executed in an adversarial\nmanner and comprises two parts: dual-generator training and dual-model\ndistillation. In dual-generator training, we delve into each generator\nconcerning fidelity, transferability and diversity to ensure its utility, and\nadditionally tailor the cross-divergence loss to lessen the overlap of dual\ngenerators' output spaces. 
In dual-model distillation, the trained dual\ngenerators work together to provide the training data for updates of the global\nmodel. At last, our extensive experiments on various image classification tasks\nshow that DFDG achieves significant performance gains in accuracy compared to\nSOTA baselines.\n","authors":["Kangyang Luo","Shuai Wang","Yexuan Fu","Renrong Shao","Xiang Li","Yunshi Lan","Ming Gao","Jinlong Shu"],"pdf_url":"https://arxiv.org/pdf/2409.07734v2.pdf","comment":"Accepted by ICDM2024 main conference (long paper). arXiv admin note:\n substantial text overlap with arXiv:2309.13546"},{"id":"http://arxiv.org/abs/2409.10770v1","updated":"2024-09-16T23:07:57Z","published":"2024-09-16T23:07:57Z","title":"HPC with Enhanced User Separation","summary":" HPC systems used for research run a wide variety of software and workflows.\nThis software is often written or modified by users to meet the needs of their\nresearch projects, and rarely is built with security in mind. In this paper we\nexplore several of the key techniques that MIT Lincoln Laboratory\nSupercomputing Center has deployed on its systems to manage the security\nimplications of these workflows by providing enforced separation for processes,\nfilesystem access, network traffic, and accelerators to make every user feel\nlike they are running on a personal HPC.\n","authors":["Andrew Prout","Albert Reuther","Michael Houle","Michael Jones","Peter Michaleas","LaToya Anderson","William Arcand","Bill Bergeron","David Bestor","Alex Bonn","Daniel Burrill","Chansup Byun","Vijay Gadepally","Matthew Hubbell","Hayden Jananthan","Piotr Luszczek","Lauren Milechin","Guillermo Morales","Julie Mullen","Antonio Rosa","Charles Yee","Jeremy Kepner"],"pdf_url":"https://arxiv.org/pdf/2409.10770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10743v1","updated":"2024-09-16T21:21:40Z","published":"2024-09-16T21:21:40Z","title":"Advances in ArborX to support exascale applications","summary":" ArborX is a performance 
portable geometric search library developed as part\nof the Exascale Computing Project (ECP). In this paper, we explore a\ncollaboration between ArborX and a cosmological simulation code HACC. Large\ncosmological simulations on exascale platforms encounter a bottleneck due to\nthe in-situ analysis requirements of halo finding, a problem of identifying\ndense clusters of dark matter (halos). This problem is solved by using a\ndensity-based DBSCAN clustering algorithm. With each MPI rank handling hundreds\nof millions of particles, it is imperative for the DBSCAN implementation to be\nefficient. In addition, the requirement to support exascale supercomputers from\ndifferent vendors necessitates performance portability of the algorithm. We\ndescribe how this challenge problem guided ArborX development, and enhanced the\nperformance and the scope of the library. We explore the improvements in the\nbasic algorithms for the underlying search index to improve the performance,\nand describe several implementations of DBSCAN in ArborX. Further, we report\nthe history of the changes in ArborX and their effect on the time to solve a\nrepresentative benchmark problem, as well as demonstrate the real world impact\non production end-to-end cosmology simulations.\n","authors":["Andrey Prokopenko","Daniel Arndt","Damien Lebrun-Grandié","Bruno Turcksin","Nicholas Frontiere","J. D. Emberson","Michael Buehlmann"],"pdf_url":"https://arxiv.org/pdf/2409.10743v1.pdf","comment":"Submitted to IJHPCA"},{"id":"http://arxiv.org/abs/2409.10727v1","updated":"2024-09-16T21:02:59Z","published":"2024-09-16T21:02:59Z","title":"Deterministic Bounds in Committee Selection: Enhancing Decentralization\n and Scalability in Distributed Ledgers","summary":" Consensus plays a crucial role in distributed ledger systems, impacting both\nscalability and decentralization. 
Many blockchain systems use a weighted\nlottery based on a scarce resource such as a stake, storage, memory, or\ncomputing power to select a committee whose members drive the consensus and are\nresponsible for adding new information to the ledger. Therefore, ensuring a\nrobust and fair committee selection process is essential for maintaining\nsecurity, efficiency, and decentralization.\n There are two main approaches to randomized committee selection. In one\napproach, each validator candidate locally checks whether they are elected to\nthe committee and reveals their proof during the consensus phase. In contrast,\nin the second approach, a sortition algorithm decides a fixed-sized committee\nthat is globally verified. This paper focuses on the latter approach, with\ncryptographic sortition as a method for fair committee selection that\nguarantees a constant committee size. Our goal is to develop deterministic\nguarantees that strengthen decentralization. We introduce novel methods that\nprovide deterministic bounds on the influence of adversaries within the\ncommittee, as evidenced by numerical experiments. This approach overcomes the\nlimitations of existing protocols that only offer probabilistic guarantees,\noften providing large committees that are impractical for many quorum-based\napplications like atomic broadcast and randomness beacon protocols.\n","authors":["Grigorii Melnikov","Sebastian Müller","Nikita Polyanskii","Yury Yanovich"],"pdf_url":"https://arxiv.org/pdf/2409.10727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00829v2","updated":"2024-09-16T20:24:40Z","published":"2024-04-03T18:07:53Z","title":"SABLE: Staging Blocked Evaluation of Sparse Matrix Computations","summary":" Sparse Matrices found in the real world often have some structure in their\ndistribution of dense elements. While existing techniques specialize the\ngenerated code for the structure of matrices, their generality misses\noptimization opportunities. 
We propose a system that -- if the sparse matrix is\nstored in a blocked storage format -- can adapt its code generation strategy\ndepending on the structure of the sparse matrix. Our system SABLE performs a\nspecified computation over every element of {\\em mostly} dense blocks instead\nof avoiding computing any sparse element and achieving regularity in generated\ncode while having special treatment for hyper-sparse blocks (ie, blocks with\nvery few dense elements). SABLE is extensible, providing a block iterator for\nusers to express any computation over these non-empty blocks. We demonstrate\nthat our approach can significantly accelerate SpMV and SpMM operations,\nsurpassing the performance of state-of-the-art systems like Partially-Strided\nCodelets and Sparse Register Tiling.\n","authors":["Pratyush Das","Adhitha Dias","Anxhelo Xhebraj","Artem Pelenitsyn","Kirshanthan Sundararajah","Milind Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2407.00829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10661v1","updated":"2024-09-16T18:52:31Z","published":"2024-09-16T18:52:31Z","title":"A Study of Performance Programming of CPU, GPU accelerated Computers and\n SIMD Architecture","summary":" Parallel computing is a standard approach to achieving high-performance\ncomputing (HPC). Three commonly used methods to implement parallel computing\ninclude: 1) applying multithreading technology on single-core or multi-core\nCPUs; 2) incorporating powerful parallel computing devices such as GPUs, FPGAs,\nand other accelerators; and 3) utilizing special parallel architectures like\nSingle Instruction/Multiple Data (SIMD).\n Many researchers have made efforts using different parallel technologies,\nincluding developing applications, conducting performance analyses, identifying\nperformance bottlenecks, and proposing feasible solutions. 
However, balancing\nand optimizing parallel programs remain challenging due to the complexity of\nparallel algorithms and hardware architectures. Issues such as data transfer\nbetween hosts and devices in heterogeneous systems continue to be bottlenecks\nthat limit performance.\n This work summarizes a vast amount of information on various parallel\nprogramming techniques, aiming to present the current state and future\ndevelopment trends of parallel programming, performance issues, and solutions.\nIt seeks to give readers an overall picture and provide background knowledge to\nsupport subsequent research.\n","authors":["Xinyao Yi"],"pdf_url":"https://arxiv.org/pdf/2409.10661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11442v1","updated":"2024-09-16T20:03:57Z","published":"2024-09-16T20:03:57Z","title":"A Green Multi-Attribute Client Selection for Over-The-Air Federated\n Learning: A Grey-Wolf-Optimizer Approach","summary":" Federated Learning (FL) has gained attention across various industries for\nits capability to train machine learning models without centralizing sensitive\ndata. While this approach offers significant benefits such as privacy\npreservation and decreased communication overhead, it presents several\nchallenges, including deployment complexity and interoperability issues,\nparticularly in heterogeneous scenarios or resource-constrained environments.\nOver-the-air (OTA) FL was introduced to tackle these challenges by\ndisseminating model updates without necessitating direct device-to-device\nconnections or centralized servers. However, OTA-FL brought forth limitations\nassociated with heightened energy consumption and network latency. 
In this\npaper, we propose a multi-attribute client selection framework employing the\ngrey wolf optimizer (GWO) to strategically control the number of participants\nin each round and optimize the OTA-FL process while considering accuracy,\nenergy, delay, reliability, and fairness constraints of participating devices.\nWe evaluate the performance of our multi-attribute client selection approach in\nterms of model loss minimization, convergence time reduction, and energy\nefficiency. In our experimental evaluation, we assessed and compared the\nperformance of our approach against the existing state-of-the-art methods. Our\nresults demonstrate that the proposed GWO-based client selection outperforms\nthese baselines across various metrics. Specifically, our approach achieves a\nnotable reduction in model loss, accelerates convergence time, and enhances\nenergy efficiency while maintaining high fairness and reliability indicators.\n","authors":["Maryam Ben Driss","Essaid Sabir","Halima Elbiaze","Abdoulaye Baniré Diallo","Mohamed Sadik"],"pdf_url":"https://arxiv.org/pdf/2409.11442v1.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2409.10231v1","updated":"2024-09-16T12:28:15Z","published":"2024-09-16T12:28:15Z","title":"High-level quantum algorithm programming using Silq","summary":" Quantum computing, with its vast potential, is fundamentally shaped by the\nintricacies of quantum mechanics, which both empower and constrain its\ncapabilities. The development of a universal, robust quantum programming\nlanguage has emerged as a key research focus in this rapidly evolving field.\nThis paper explores Silq, a recent high-level quantum programming language,\nhighlighting its strengths and unique features. 
We aim to share our insights on\ndesigning and implementing high-level quantum algorithms using Silq,\ndemonstrating its practical applications and advantages for quantum\nprogramming.\n","authors":["Viktorija Bezganovic","Marco Lewis","Sadegh Soudjani","Paolo Zuliani"],"pdf_url":"https://arxiv.org/pdf/2409.10231v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2409.09934v1","updated":"2024-09-16T02:17:58Z","published":"2024-09-16T02:17:58Z","title":"Coordination-free Collaborative Replication based on Operational\n Transformation","summary":" We introduce Coordination-free Collaborative Replication (CCR), a new method\nfor maintaining consistency across replicas in distributed systems without\nrequiring explicit coordination messages. CCR automates conflict resolution,\ncontrasting with traditional Data-sharing systems that typically involve\ncentralized update management or predefined consistency rules.\n Operational Transformation (OT), commonly used in collaborative editing,\nensures consistency by transforming operations while maintaining document\nintegrity across replicas. However, OT assumes server-based coordination, which\nis unsuitable for modern, decentralized Peer-to-Peer (P2P) systems.\n Conflict-free Replicated Data Type (CRDT), like Two-Phase Sets (2P-Sets),\nguarantees eventual consistency by allowing commutative and associative\noperations but often result in counterintuitive behaviors, such as failing to\nre-add an item to a shopping cart once removed.\n In contrast, CCR employs a more intuitive approach to replication. It allows\nfor straightforward updates and conflict resolution based on the current data\nstate, enhancing clarity and usability compared to CRDTs. 
Furthermore, CCR\naddresses inefficiencies in messaging by developing a versatile protocol based\non data stream confluence, thus providing a more efficient and practical\nsolution for collaborative data sharing in distributed systems.\n","authors":["Masato Takeichi"],"pdf_url":"https://arxiv.org/pdf/2409.09934v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2409.10464v1","updated":"2024-09-16T16:56:28Z","published":"2024-09-16T16:56:28Z","title":"New Direct Sum Tests","summary":" A function $f:[n]^{d} \\to \\mathbb{F}_2$ is a \\defn{direct sum} if there are\nfunctions $L_i:[n]\\to \\mathbb{F}_2$ such that ${f(x) = \\sum_{i}L_i(x_i)}$. In\nthis work we give multiple results related to the property testing of direct\nsums.\n Our first result concerns a test proposed by Dinur and Golubev in 2019. We\ncall their test the Diamond test and show that it is indeed a direct sum\ntester. More specifically, we show that if a function $f$ is $\\epsilon$-far\nfrom being a direct sum function, then the Diamond test rejects $f$ with\nprobability at least $\\Omega_{n,\\epsilon}(1)$. Even in the case of $n = 2$, the\nDiamond test is, to the best of our knowledge, novel and yields a new tester\nfor the classic property of affinity.\n Apart from the Diamond test, we also analyze a broad family of direct sum\ntests, which at a high level, run an arbitrary affinity test on the restriction\nof $f$ to a random hypercube inside of $[n]^d$. This family of tests includes\nthe direct sum test analyzed in \\cite{di19}, but does not include the Diamond\ntest. 
As an application of our result, we obtain a direct sum test which works\nin the online adversary model of \\cite{KRV}.\n Finally, we also discuss a Fourier analytic interpretation of the diamond\ntester in the $n=2$ case, as well as prove local correction results for direct\nsum as conjectured by Dinur and Golubev.\n","authors":["Alek Westover","Edward Yu","Kai Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.10464v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.01022v3","updated":"2024-09-16T11:47:39Z","published":"2024-04-01T10:04:37Z","title":"On the Complexity of Minimizing Energy Consumption of Partitioning DAG\n Tasks","summary":" We study a graph partition problem where we are given a directed acyclic\ngraph (DAG) whose vertices and arcs can be respectively regarded as tasks and\ndependencies among tasks. The objective of the problem is to minimize the total\nenergy consumed for completing these tasks by assigning the tasks to k\nheterogeneous machines. We first show that the problem is NP-hard. Then, we\npresent polynomial-time algorithms for two special cases where there are only\ntwo machines and where the input DAG is a directed path. Finally, we study a\nnatural variant where there are only two machines with one of them being\ncapable of executing a limited number of tasks. We show that this special case\nremains computationally hard.\n","authors":["Wei Liu","Jian-Jia Chen","Yongjie Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01022v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10155v1","updated":"2024-09-16T10:39:57Z","published":"2024-09-16T10:39:57Z","title":"Efficient approximation schemes for scheduling on a stochastic number of\n machines","summary":" We study three two-stage optimization problems with a similar structure and\ndifferent objectives. In the first stage of each problem, the goal is to assign\ninput jobs of positive sizes to unsplittable bags. 
After this assignment is\ndecided, the realization of the number of identical machines that will be\navailable is revealed. Then, in the second stage, the bags are assigned to\nmachines. The probability vector of the number of machines in the second stage\nis known to the algorithm as part of the input before making the decisions of\nthe first stage. Thus, the vector of machine completion times is a random\nvariable. The goal of the first problem is to minimize the expected value of\nthe makespan of the second stage schedule, while the goal of the second problem\nis to maximize the expected value of the minimum completion time of the\nmachines in the second stage solution. The goal of the third problem is to\nminimize the \\ell_p norm for a fixed p>1, where the norm is applied on\nmachines' completion times vectors. Each one of the first two problems admits a\nPTAS as Buchem et al. showed recently. Here we significantly improve all their\nresults by designing an EPTAS for each one of these problems. We also design an\nEPTAS for \\ell_p norm minimization for any p>1.\n","authors":["Leah Epstein","Asaf Levin"],"pdf_url":"https://arxiv.org/pdf/2409.10155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.04142v3","updated":"2024-09-16T09:38:36Z","published":"2021-12-06T11:46:20Z","title":"A Reply to \"On Salum's Algorithm for X3SAT\"","summary":" This paper is a reply to \"On Salum's Algorithm for X3SAT\" (arXiv:2104.02886)\n","authors":["Latif Salum"],"pdf_url":"https://arxiv.org/pdf/2203.04142v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10476v4","updated":"2024-09-16T18:30:38Z","published":"2024-06-15T02:54:55Z","title":"On $NP$ versus ${\\rm co}NP$ and Frege Systems","summary":" We prove in this paper that there is a language $L_d$ accepted by some\nnondeterministic Turing machines but not by any ${\\rm co}\\mathcal{NP}$-machines\n(defined later). 
Then we further show that $L_d$ is in $\\mathcal{NP}$, thus\nproving that $\\mathcal{NP}\\neq{\\rm co}\\mathcal{NP}$. The techniques used in\nthis paper are lazy-diagonalization and the novel new technique developed in\nauthor's recent work \\cite{Lin21}. As a by-product, we reach the important\nresult that $\\mathcal{P}\\neq\\mathcal{NP}$ \\cite{Lin21} once again, which is\nclear from the above outcome and the well-known fact that $\\mathcal{P}={\\rm\nco}\\mathcal{P}$. Next, we show that the complexity class ${\\rm co}\\mathcal{NP}$\nhas intermediate languages, i.e., there are language $L_{inter}\\in{\\rm\nco}\\mathcal{NP}$ which is not in $\\mathcal{P}$ and not ${\\rm\nco}\\mathcal{NP}$-complete. We also summarize other direct consequences such as\n$\\mathcal{NEXP}\\neq{\\rm co}\\mathcal{NEXP}$ and other which is in the area of\nproof complexity implied by our main outcome. Lastly, we show a lower bounds\nresult for Frege proof systems, i.e., no Frege proof systems can be polynomial\nbounded.\n","authors":["Tianrong Lin"],"pdf_url":"https://arxiv.org/pdf/2406.10476v4.pdf","comment":"[v4] 30 pages; further improved; arXiv admin note: text overlap with\n arXiv:2110.06211"},{"id":"http://arxiv.org/abs/2311.00882v3","updated":"2024-09-16T19:55:38Z","published":"2023-11-01T22:15:19Z","title":"Semidefinite programming and linear equations vs. homomorphism problems","summary":" We introduce a relaxation for homomorphism problems that combines\nsemidefinite programming with linear Diophantine equations, and propose a\nframework for the analysis of its power based on the spectral theory of\nassociation schemes. 
We use this framework to establish an unconditional lower\nbound against the semidefinite programming + linear equations model, by showing\nthat the relaxation does not solve the approximate graph homomorphism problem\nand thus, in particular, the approximate graph colouring problem.\n","authors":["Lorenzo Ciardo","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2311.00882v3.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2304.06348v3","updated":"2024-09-16T17:57:11Z","published":"2023-04-13T08:57:17Z","title":"Decidability of Querying First-Order Theories via Countermodels of\n Finite Width","summary":" We propose a generic framework for establishing the decidability of a wide\nrange of logical entailment problems (briefly called querying), based on the\nexistence of countermodels that are structurally simple, gauged by certain\ntypes of width measures (with treewidth and cliquewidth as popular examples).\nAs an important special case of our framework, we identify logics exhibiting\nwidth-finite finitely universal model sets, warranting decidable entailment for\na wide range of homomorphism-closed queries, subsuming a diverse set of\npractically relevant query languages. As a particularly powerful width measure,\nwe propose to employ Blumensath's partitionwidth, which subsumes various other\ncommonly considered width measures and exhibits highly favorable computational\nand structural properties. Focusing on the formalism of existential rules as a\npopular showcase, we explain how finite partitionwidth sets of rules subsume\nother known abstract decidable classes but - leveraging existing notions of\nstratification - also cover a wide range of new rulesets. We expose natural\nlimitations for fitting the class of finite unification sets into our picture\nand suggest several options for remedy.\n","authors":["Thomas Feller","Tim S. 
Lyon","Piotr Ostropolski-Nalewaja","Sebastian Rudolph"],"pdf_url":"https://arxiv.org/pdf/2304.06348v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08489v3","updated":"2024-09-16T14:55:24Z","published":"2023-05-15T09:45:30Z","title":"Extensional Taylor Expansion","summary":" We introduce a calculus of extensional resource terms. These are resource\nterms \\`a la Ehrhard-Regnier, but in infinitely eta-long form. The calculus\nstill retains a finite syntax and dynamics: in particular, we prove strong\nconfluence and normalization.\n Then we define an extensional version of Taylor expansion, mapping ordinary\nlambda-terms to (possibly infinite) linear combinations of extensional resource\nterms: like in the ordinary case, the dynamics of our resource calculus allows\nus to simulate the beta-reduction of lambda-terms; the extensional nature of\nthis expansion shows in the fact that we are also able to simulate\neta-reduction.\n In a sense, extensional resource terms contain a language of finite\napproximants of Nakajima trees, much like ordinary resource terms can be seen\nas a richer version of finite B\\\"ohm trees. We show that the equivalence\ninduced on lambda-terms by the normalization of extensional Taylor-expansion is\nnothing but H*, the greatest consistent sensible lambda-theory - which is also\nthe theory induced by Nakajima trees. This characterization provides a new,\nsimple way to exhibit models of H*: it becomes sufficient to model the\nextensional resource calculus and its dynamics.\n The extensional resource calculus moreover allows us to recover, in an\nuntyped setting, a connection between Taylor expansion and game semantics that\nwas previously limited to the typed setting. Indeed, simply typed, eta-long,\nbeta-normal resource terms are known to be in bijective correspondence with\nplays in the sense of Hyland-Ong game semantics, up to Melli\\`es' homotopy\nequivalence. 
Extensional resource terms are the appropriate counterpart of\neta-long resource terms in an untyped setting: we spell out the bijection\nbetween normal extensional resource terms and isomorphism classes of\naugmentations (a canonical presentation of plays up to homotopy) in the\nuniversal arena.\n","authors":["Lison Blondeau-Patissier","Pierre Clairambault","Lionel Vaux Auclair"],"pdf_url":"https://arxiv.org/pdf/2305.08489v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10237v1","updated":"2024-09-16T12:35:00Z","published":"2024-09-16T12:35:00Z","title":"Directed equality with dinaturality","summary":" We show how dinaturality plays a central role in the interpretation of\ndirected type theory where types are interpreted as (1-)categories and directed\nequality is represented by $\\hom$-functors. We present a general elimination\nprinciple based on dinaturality for directed equality which very closely\nresembles the $J$-rule used in Martin-L\\\"of type theory, and we highlight which\nsyntactical restrictions are needed to interpret this rule in the context of\ndirected equality. We then use these rules to characterize directed equality as\na left relative adjoint to a functor between (para)categories of dinatural\ntransformations which contracts together two variables appearing naturally with\na single dinatural one, with the relative functor imposing the syntactic\nrestrictions needed. We then argue that the quantifiers of such a directed type\ntheory should be interpreted as ends and coends, which dinaturality allows us\nto present in adjoint-like correspondences to a weakening functor. Using these\nrules we give a formal interpretation to Yoneda reductions and (co)end\ncalculus, and we use logical derivations to prove the Fubini rule for\nquantifier exchange, the adjointness property of Kan extensions via (co)ends,\nexponential objects of presheaves, and the (co)Yoneda lemma. 
We show\ntransitivity (composition), congruence (functoriality), and transport\n(coYoneda) for directed equality by closely following the same approach of\nMartin-L\\\"of type theory, with the notable exception of symmetry. We formalize\nour main theorems in Agda.\n","authors":["Andrea Laretto","Fosco Loregian","Niccolò Veltri"],"pdf_url":"https://arxiv.org/pdf/2409.10237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10170v1","updated":"2024-09-16T11:02:47Z","published":"2024-09-16T11:02:47Z","title":"Minimal Model Counting via Knowledge Compilation","summary":" Counting the number of models of a Boolean formula is a fundamental problem\nin artificial intelligence and reasoning. Minimal models of a Boolean formula\nare critical in various reasoning systems, making the counting of minimal\nmodels essential for detailed inference tasks. Existing research primarily\nfocused on decision problems related to minimal models. In this work, we extend\nbeyond decision problems to address the challenge of counting minimal models.\nSpecifically, we propose a novel knowledge compilation form that facilitates\nthe efficient counting of minimal models. Our approach leverages the idea of\njustification and incorporates theories from answer set counting.\n","authors":["Mohimenul Kabir"],"pdf_url":"https://arxiv.org/pdf/2409.10170v1.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2407.00829v2","updated":"2024-09-16T20:24:40Z","published":"2024-04-03T18:07:53Z","title":"SABLE: Staging Blocked Evaluation of Sparse Matrix Computations","summary":" Sparse Matrices found in the real world often have some structure in their\ndistribution of dense elements. While existing techniques specialize the\ngenerated code for the structure of matrices, their generality misses\noptimization opportunities. 
We propose a system that -- if the sparse matrix is\nstored in a blocked storage format -- can adapt its code generation strategy\ndepending on the structure of the sparse matrix. Our system SABLE performs a\nspecified computation over every element of {\\em mostly} dense blocks instead\nof avoiding computing any sparse element and achieving regularity in generated\ncode while having special treatment for hyper-sparse blocks (ie, blocks with\nvery few dense elements). SABLE is extensible, providing a block iterator for\nusers to express any computation over these non-empty blocks. We demonstrate\nthat our approach can significantly accelerate SpMV and SpMM operations,\nsurpassing the performance of state-of-the-art systems like Partially-Strided\nCodelets and Sparse Register Tiling.\n","authors":["Pratyush Das","Adhitha Dias","Anxhelo Xhebraj","Artem Pelenitsyn","Kirshanthan Sundararajah","Milind Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2407.00829v2.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/1906.10093v2","updated":"2024-09-16T17:13:54Z","published":"2019-06-24T17:18:12Z","title":"Efficient Analysis of Unambiguous Automata Using Matrix Semigroup\n Techniques","summary":" We introduce a novel technique to analyse unambiguous B\\\"uchi automata\nquantitatively, and apply this to the model checking problem. It is based on\nlinear-algebra arguments that originate from the analysis of matrix semigroups\nwith constant spectral radius. This method can replace a combinatorial\nprocedure that dominates the computational complexity of the existing procedure\nby Baier et al. 
We analyse the complexity in detail, showing that, in terms of\nthe set $Q$ of states of the automaton, the new algorithm runs in time\n$O(|Q|^4)$, improving on an efficient implementation of the combinatorial\nalgorithm by a factor of $|Q|$.\n","authors":["Stefan Kiefer","Cas Widdershoven"],"pdf_url":"https://arxiv.org/pdf/1906.10093v2.pdf","comment":"Technical report for an MFCS'19 paper. This version fixes a bug in\n Appendix A"}]},"2024-09-15T00:00:00Z":{"Computational Complexity":[{"id":"http://arxiv.org/abs/2402.08434v4","updated":"2024-09-15T20:08:40Z","published":"2024-02-13T13:03:49Z","title":"Solving promise equations over monoids and groups","summary":" We give a complete complexity classification for the problem of finding a\nsolution to a given system of equations over a fixed finite monoid, given that\na solution over a more restricted monoid exists. As a corollary, we obtain a\ncomplexity classification for the same problem over groups.\n","authors":["Alberto Larrauri","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2402.08434v4.pdf","comment":"Full version of an ICALP 2024 paper"},{"id":"http://arxiv.org/abs/2409.09734v1","updated":"2024-09-15T13:51:02Z","published":"2024-09-15T13:51:02Z","title":"Complexity and algorithms for Swap median and relation to other\n consensus problems","summary":" Genome rearrangements are events in which large blocks of DNA exchange pieces\nduring evolution. The analysis of such events is a tool for understanding\nevolutionary genomics, based on finding the minimum number of rearrangements to\ntransform one genome into another. In a general scenario, more than two genomes\nare considered and we have new challenges. The {\\sc Median} problem consists in\nfinding, given three permutations and a distance metric, a permutation $s$ that\nminimizes the sum of the distances between $s$ and each input. 
We study the\n{\\sc median} problem over \\emph{swap} distances in permutations, for which the\ncomputational complexity has been open for almost 20 years (Eriksen,\n\\emph{Theor. Compt. Sci.}, 2007). We consider this problem through some\nbranches. We associate median solutions and interval convex sets, where the\nconcept of graph convexity inspires the following investigation: Does a median\npermutation belong to every shortest path between one of the pairs of input\npermutations? We are able to partially answer this question, and as a\nby-product we solve a long open problem by proving that the {\\sc Swap Median}\nproblem is NP-hard. Furthermore, using a similar approach, we show that the\n{\\sc Closest} problem, which seeks to minimize the maximum distance between the\nsolution and the input permutations, is NP-hard even considering three input\npermutations. This gives a sharp dichotomy into the P vs. NP-hard approaches,\nsince considering two input permutations the problem is easily solvable and\nconsidering any number of input permutations it is known to be NP-hard since\n2007 (Popov, \\emph{Theor. Compt. Sci.}, 2007). In addition, we show that {\\sc\nSwap Median} and {\\sc Swap Closest} are APX-hard problems.\n","authors":["Luís Cunha","Thiago Lopes","Arnaud Mary"],"pdf_url":"https://arxiv.org/pdf/2409.09734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12982v2","updated":"2024-09-15T12:38:48Z","published":"2022-11-23T14:41:02Z","title":"The Stochastic Arrival Problem","summary":" We study a new modification of the Arrival problem, which allows for nodes\nthat exhibit random as well as controlled behaviour, in addition to switching\nnodes. We study the computational complexity of these extensions, building on\nexisting work on Reachability Switching Games. 
In particular, we show for\nversions of the arrival problem involving just switching and random nodes it is\n\\PP{}-hard to decide if their value is greater than a half and we give a PSPACE\ndecision algorithm.\n","authors":["Thomas Webster"],"pdf_url":"https://arxiv.org/pdf/2211.12982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03965v7","updated":"2024-09-15T10:06:31Z","published":"2023-04-08T09:32:22Z","title":"The n-vehicle exploration problem is NP-complete","summary":" The $n$-vehicle exploration problem (NVEP) is a nonlinear unconstrained\noptimization problem. Given a fleet of $n$ vehicles with mid-trip refueling\ntechnique, the NVEP tries to find a sequence of $n$ vehicles to make one of the\nvehicles travel the farthest, and at last all the vehicles return to the start\npoint. NVEP has a fractional form of objective function, and its computational\ncomplexity of general case remains open. Given a directed graph $G$, it can be\nreduced in polynomial time to an instance of NVEP. We prove that the graph $G$\nhas a hamiltonian path if and only if the reduced NVEP instance has a feasible\nsequence of length at least $n$. Therefore we show that Hamiltonian path\n$\\leq_P$ NVEP, and consequently prove that NVEP is NP-complete.\n","authors":["Jinchuan Cui","Xiaoya Li"],"pdf_url":"https://arxiv.org/pdf/2304.03965v7.pdf","comment":"5 pages, no figure"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2402.08434v4","updated":"2024-09-15T20:08:40Z","published":"2024-02-13T13:03:49Z","title":"Solving promise equations over monoids and groups","summary":" We give a complete complexity classification for the problem of finding a\nsolution to a given system of equations over a fixed finite monoid, given that\na solution over a more restricted monoid exists. 
As a corollary, we obtain a\ncomplexity classification for the same problem over groups.\n","authors":["Alberto Larrauri","Stanislav Živný"],"pdf_url":"https://arxiv.org/pdf/2402.08434v4.pdf","comment":"Full version of an ICALP 2024 paper"},{"id":"http://arxiv.org/abs/2107.10801v6","updated":"2024-09-15T22:36:20Z","published":"2021-07-22T16:56:07Z","title":"Specifying a Game-Theoretic Extensive Form as an Abstract 5-ary Relation","summary":" This paper specifies an extensive form as a 5-ary relation (that is, as a set\nof quintuples) which satisfies eight abstract axioms. Each quintuple is\nunderstood to list a player, a situation (that is, a name for an information\nset), a decision node, an action, and a successor node. Accordingly, the axioms\nare understood to specify abstract relationships between players, situations,\nnodes, and actions. Such an extensive form is called a \"pentaform\". Finally, a\n\"pentaform game\" is defined to be a pentaform together with utility functions.\n To ground this new specification in the literature, the paper defines the\nconcept of a \"traditional game\" to represent the literature's many\nspecifications of finite-horizon and infinite-horizon games. The paper's main\nresult is to construct an intuitive bijection between pentaform games and\ntraditional games. Secondary results concern disaggregating pentaforms by\nsubsets, constructing pentaforms by unions, and initial pentaform applications\nto Selten subgames and perfect-recall (an extensive application to dynamic\nprogramming is in Streufert 2023, arXiv:2302.03855).\n","authors":["Peter A. Streufert"],"pdf_url":"https://arxiv.org/pdf/2107.10801v6.pdf","comment":"53 pages, 9 figures. This version 6 makes small editorial changes.\n Version 5 had essentially the same results as Version 4, but with improved\n exposition and appendices. Version 4 merely updated cross-references, while\n Version 3 was extensively rewritten with new tools and applications. 
Version\n 1 is Western University Department of Economics Research Report 2021-3"},{"id":"http://arxiv.org/abs/2205.10995v3","updated":"2024-09-15T15:28:05Z","published":"2022-05-23T01:56:52Z","title":"From Width-Based Model Checking to Width-Based Automated Theorem Proving","summary":" In the field of parameterized complexity theory, the study of graph width\nmeasures has been intimately connected with the development of width-based\nmodel checking algorithms for combinatorial properties on graphs. In this work,\nwe introduce a general framework to convert a large class of width-based\nmodel-checking algorithms into algorithms that can be used to test the validity\nof graph-theoretic conjectures on classes of graphs of bounded width. Our\nframework is modular and can be applied with respect to several well-studied\nwidth measures for graphs, including treewidth and cliquewidth.\n As a quantitative application of our framework, we prove analytically that\nfor several long-standing graph-theoretic conjectures, there exists an\nalgorithm that takes a number $k$ as input and correctly determines in time\ndouble-exponential in $k^{O(1)}$ whether the conjecture is valid on all graphs\nof treewidth at most $k$. 
These upper bounds, which may be regarded as\nupper-bounds on the size of proofs/disproofs for these conjectures on the class\nof graphs of treewidth at most $k$, improve significantly on theoretical upper\nbounds obtained using previously available techniques.\n","authors":["Mateus de Oliveira Oliveira","Farhad Vadiee"],"pdf_url":"https://arxiv.org/pdf/2205.10995v3.pdf","comment":"A preliminary version of this work was published in the proceedings\n of AAAI 2023"}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2405.13347v2","updated":"2024-09-15T15:10:57Z","published":"2024-05-22T05:07:56Z","title":"Time-Series Forecasting and Sequence Learning Using Memristor-based\n Reservoir System","summary":" Pushing the frontiers of time-series information processing in the\never-growing domain of edge devices with stringent resources has been impeded\nby the systems' ability to process information and learn locally on the device.\nLocal processing and learning of time-series information typically demand\nintensive computations and massive storage as the process involves retrieving\ninformation and tuning hundreds of parameters back in time. In this work, we\ndeveloped a memristor-based echo state network accelerator that features\nefficient temporal data processing and in-situ online learning. The proposed\ndesign is benchmarked using various datasets involving real-world tasks, such\nas forecasting the load energy consumption and weather conditions. The\nexperimental results illustrate that the hardware model experiences a marginal\ndegradation in performance as compared to the software counterpart. This is\nmainly attributed to the limited precision and dynamic range of network\nparameters when emulated using memristor devices. The proposed system is\nevaluated for lifespan, robustness, and energy-delay product. It is observed\nthat the system demonstrates reasonable robustness for device failure below\n10%, which may occur due to stuck-at faults. 
Furthermore, 247X reduction in\nenergy consumption is achieved when compared to a custom CMOS digital design\nimplemented at the same technology node.\n","authors":["Abdullah M. Zyarah","Dhireesha Kudithipudi"],"pdf_url":"https://arxiv.org/pdf/2405.13347v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04910v2","updated":"2024-09-15T12:32:18Z","published":"2024-06-07T13:00:57Z","title":"PolyLUT-Add: FPGA-based LUT Inference with Wide Inputs","summary":" FPGAs have distinct advantages as a technology for deploying deep neural\nnetworks (DNNs) at the edge. Lookup Table (LUT) based networks, where neurons\nare directly modeled using LUTs, help maximize this promise of offering\nultra-low latency and high area efficiency on FPGAs. Unfortunately, LUT\nresource usage scales exponentially with the number of inputs to the LUT,\nrestricting PolyLUT to small LUT sizes. This work introduces PolyLUT-Add, a\ntechnique that enhances neuron connectivity by combining $A$ PolyLUT\nsub-neurons via addition to improve accuracy. Moreover, we describe a novel\narchitecture to improve its scalability. We evaluated our implementation over\nthe MNIST, Jet Substructure classification, and Network Intrusion Detection\nbenchmark and found that for similar accuracy, PolyLUT-Add achieves a LUT\nreduction of $2.0-13.9\\times$ with a $1.2-1.6\\times$ decrease in latency.\n","authors":["Binglei Lou","Richard Rademacher","David Boland","Philip H. W. Leong"],"pdf_url":"https://arxiv.org/pdf/2406.04910v2.pdf","comment":"The source code for this paper is available at:\n https://github.com/bingleilou/PolyLUT-Add"},{"id":"http://arxiv.org/abs/2409.09689v1","updated":"2024-09-15T10:52:21Z","published":"2024-09-15T10:52:21Z","title":"CAT: Customized Transformer Accelerator Framework on Versal ACAP","summary":" Transformer uses GPU as the initial design platform, but GPU can only perform\nlimited hardware customization. 
Although FPGA has strong customization ability,\nthe design solution space is huge and the design difficulty is high. Versal\nACAP is a heterogeneous computing architecture with AI Engine as the core. It\nis far more flexible than GPU in hardware customization, and has better and\nsmaller design solution space than traditional FPGA. Therefore, this paper\nproposes the Customized Transformer Accelerator Framework(CAT), through the CAT\nframework, a customized Transformer accelerator family can be derived on Versal\nACAP, CAT framework has an abstract accelerator architecture design idea, which\ndeconstructs and efficiently maps the Transformer into the hardware, which\ncontains a variety of customizable properties. Through the customization and\noptimization strategy of the CAT framework, the underlying hardware and the\nupper model jointly constrain and decide on these customizable properties, and\nfinally form a customized accelerator. We use a 7 nm AMD Versal ACAP VCK5000\ndevelopment board to implement accelerators for different Transformer models\nbased on the CAT framework. Experiments show that we achieve the highest\nthroughput gains of 2.41x, 49.50x, and 1.32x compared to 8 nm Nvidia GPU A10G,\n16 nm AMD FPGA ZCU102, and 7 nm AMD Versal ACAP VC190(SOTA). The highest energy\nefficiency gains are 7.80x, 6.19x and 1.15x, respectively.\n","authors":["Wenbo Zhang","Yiqi Liu","Zhenshan Bao"],"pdf_url":"https://arxiv.org/pdf/2409.09689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11437v1","updated":"2024-09-15T21:09:30Z","published":"2024-09-15T21:09:30Z","title":"Pack my weights and run! Minimizing overheads for in-memory computing\n accelerators","summary":" In-memory computing hardware accelerators allow more than 10x improvements in\npeak efficiency and performance for matrix-vector multiplications (MVM)\ncompared to conventional digital designs. For this, they have gained great\ninterest for the acceleration of neural network workloads. 
Nevertheless, these\npotential gains are only achieved when the utilization of the computational\nresources is maximized and the overhead from loading operands in the memory\narray minimized. To this aim, this paper proposes a novel mapping algorithm for\nthe weights in the IMC macro, based on efficient packing of the weights of\nnetwork layers in the available memory. The algorithm realizes 1) minimization\nof weight loading times while at the same time 2) maximally exploiting the\nparallelism of the IMC computational fabric. A set of case studies are carried\nout to show achievable trade-offs for the MLPerf Tiny benchmark\n\\cite{mlperftiny} on IMC architectures, with potential $10-100\\times$ EDP\nimprovements.\n","authors":["Pouya Houshmand","Marian Verhelst"],"pdf_url":"https://arxiv.org/pdf/2409.11437v1.pdf","comment":"7 pages, 9 figures"}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2306.13002v3","updated":"2024-09-15T23:09:35Z","published":"2023-06-22T15:59:04Z","title":"ACC Saturator: Automatic Kernel Optimization for Directive-Based GPU\n Code","summary":" Automatic code optimization is a complex process that typically involves the\napplication of multiple discrete algorithms that modify the program structure\nirreversibly. However, the design of these algorithms is often monolithic, and\nthey require repetitive implementation to perform similar analyses due to the\nlack of cooperation. To address this issue, modern optimization techniques,\nsuch as equality saturation, allow for exhaustive term rewriting at various\nlevels of inputs, thereby simplifying compiler design.\n In this paper, we propose equality saturation to optimize sequential codes\nutilized in directive-based programming for GPUs. Our approach realizes less\ncomputation, less memory access, and high memory throughput simultaneously. 
Our\nfully-automated framework constructs single-assignment forms from inputs to be\nentirely rewritten while keeping dependencies and extracts optimal cases.\nThrough practical benchmarks, we demonstrate a significant performance\nimprovement on several compilers. Furthermore, we highlight the advantages of\ncomputational reordering and emphasize the significance of memory-access order\nfor modern GPUs.\n","authors":["Kazuaki Matsumura","Simon Garcia De Gonzalo","Antonio J. Peña"],"pdf_url":"https://arxiv.org/pdf/2306.13002v3.pdf","comment":"To appear in: Proceedings of Eleventh Workshop on Accelerator\n Programming and Directives (WACCPD 2024)"},{"id":"http://arxiv.org/abs/2409.09887v1","updated":"2024-09-15T22:50:57Z","published":"2024-09-15T22:50:57Z","title":"Leiden-Fusion Partitioning Method for Effective Distributed Training of\n Graph Embeddings","summary":" In the area of large-scale training of graph embeddings, effective training\nframeworks and partitioning methods are critical for handling large networks.\nHowever, they face two major challenges: 1) existing synchronized distributed\nframeworks require continuous communication to access information from other\nmachines, and 2) the inability of current partitioning methods to ensure that\nsubgraphs remain connected components without isolated nodes, which is\nessential for effective training of GNNs since training relies on information\naggregation from neighboring nodes. To address these issues, we introduce a\nnovel partitioning method, named Leiden-Fusion, designed for large-scale\ntraining of graphs with minimal communication. Our method extends the Leiden\ncommunity detection algorithm with a greedy algorithm that merges the smallest\ncommunities with highly connected neighboring communities. Our method\nguarantees that, for an initially connected graph, each partition is a densely\nconnected subgraph with no isolated nodes. 
After obtaining the partitions, we\ntrain a GNN for each partition independently, and finally integrate all\nembeddings for node classification tasks, which significantly reduces the need\nfor network communication and enhances the efficiency of distributed graph\ntraining. We demonstrate the effectiveness of our method through extensive\nevaluations on several benchmark datasets, achieving high efficiency while\npreserving the quality of the graph embeddings for node classification tasks.\n","authors":["Yuhe Bai","Camelia Constantin","Hubert Naacke"],"pdf_url":"https://arxiv.org/pdf/2409.09887v1.pdf","comment":"Accepted at the 2024 European Conference on Machine Learning and\n Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD 2024)"},{"id":"http://arxiv.org/abs/2409.09874v1","updated":"2024-09-15T21:50:20Z","published":"2024-09-15T21:50:20Z","title":"The Landscape of GPU-Centric Communication","summary":" n recent years, GPUs have become the preferred accelerators for HPC and ML\napplications due to their parallelism and fast memory bandwidth. While GPUs\nboost computation, inter-GPU communication can create scalability bottlenecks,\nespecially as the number of GPUs per node and cluster grows. Traditionally, the\nCPU managed multi-GPU communication, but advancements in GPU-centric\ncommunication now challenge this CPU dominance by reducing its involvement,\ngranting GPUs more autonomy in communication tasks, and addressing mismatches\nin multi-GPU communication and computation.\n This paper provides a landscape of GPU-centric communication, focusing on\nvendor mechanisms and user-level library supports. It aims to clarify the\ncomplexities and diverse options in this field, define the terminology, and\ncategorize existing approaches within and across nodes. 
The paper discusses\nvendor-provided mechanisms for communication and memory management in multi-GPU\nexecution and reviews major communication libraries, their benefits,\nchallenges, and performance insights. Then, it explores key research paradigms,\nfuture outlooks, and open research questions. By extensively describing\nGPU-centric communication techniques across the software and hardware stacks,\nwe provide researchers, programmers, engineers, and library designers insights\non how to exploit multi-GPU systems at their best.\n","authors":["Didem Unat","Ilyas Turimbetov","Mohammed Kefah Taha Issa","Doğan Sağbili","Flavio Vella","Daniele De Sensi","Ismayil Ismayilov"],"pdf_url":"https://arxiv.org/pdf/2409.09874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09794v1","updated":"2024-09-15T17:04:25Z","published":"2024-09-15T17:04:25Z","title":"Federated Learning in Adversarial Environments: Testbed Design and\n Poisoning Resilience in Cybersecurity","summary":" This paper presents the design and implementation of a Federated Learning\n(FL) testbed, focusing on its application in cybersecurity and evaluating its\nresilience against poisoning attacks. Federated Learning allows multiple\nclients to collaboratively train a global model while keeping their data\ndecentralized, addressing critical needs for data privacy and security,\nparticularly in sensitive fields like cybersecurity. Our testbed, built using\nthe Flower framework, facilitates experimentation with various FL frameworks,\nassessing their performance, scalability, and ease of integration. Through a\ncase study on federated intrusion detection systems, we demonstrate the\ntestbed's capabilities in detecting anomalies and securing critical\ninfrastructure without exposing sensitive network data. Comprehensive poisoning\ntests, targeting both model and data integrity, evaluate the system's\nrobustness under adversarial conditions. 
Our results show that while federated\nlearning enhances data privacy and distributed learning, it remains vulnerable\nto poisoning attacks, which must be mitigated to ensure its reliability in\nreal-world applications.\n","authors":["Hao Jian Huang","Bekzod Iskandarov","Mizanur Rahman","Hakan T. Otal","M. Abdullah Canbaz"],"pdf_url":"https://arxiv.org/pdf/2409.09794v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.05904v2","updated":"2024-09-15T10:24:15Z","published":"2024-06-09T19:53:48Z","title":"Aegis: A Decentralized Expansion Blockchain","summary":" Blockchains implement monetary systems operated by committees of nodes. The\nrobustness of established blockchains presents an opportunity to leverage their\ninfrastructure for creating expansion chains. Expansion chains can provide\nadditional functionality to the primary chain they leverage or implement\nseparate functionalities, while benefiting from the primary chain's security\nand the stability of its tokens. Indeed, tools like Ethereum's EigenLayer\nenable nodes to stake (deposit collateral) on a primary chain to form a\ncommittee responsible for operating an expansion chain.\n But here is the rub. Classical protocols assume correct, well-behaved nodes\nstay correct indefinitely. Yet in our case, the stake incentivizes\ncorrectness--it will be slashed (revoked) if its owner deviates. Once a node\nwithdraws its stake, there is no basis to assume its correctness.\n To address the new challenge, we present Aegis, an expansion chain based on\nprimary-chain stake, assuming a bounded primary-chain write time. Aegis uses\nreferences from Aegis blocks to primary blocks to define committees,\ncheckpoints on the primary chain to perpetuate decisions, and resets on the\nprimary chain to establish a new committee if the previous one becomes\nobsolete. 
It ensures safety at all times and rapid progress when latency among\nAegis nodes is low.\n","authors":["Yogev Bar-On","Roi Bar-Zur","Omer Ben-Porat","Nimrod Cohen","Ittay Eyal","Matan Sitbon"],"pdf_url":"https://arxiv.org/pdf/2406.05904v2.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2307.14471v3","updated":"2024-09-15T03:59:50Z","published":"2023-07-26T19:20:03Z","title":"Modal Abstractions for Virtualizing Memory Addresses","summary":" Operating system kernels employ virtual memory subsystems, which use a CPU's\nmemory management units (MMUs) to virtualize the addresses of memory regions\nOperating systems manipulate these virtualized memory mappings to isolate\nuntrusted processes, restrict which memory is accessible to different\nprocesses, hide memory limits from user programs, ensure process isolation,\nimplement demand-paging and copy-on-write behaviors for performance and\nresource controls.\n Virtual memory management (VMM) code is a critical piece of general-purpose\nOS kernels, but verification of this functionality is challenging due to the\ncomplexity of the hardware interface. In this paper, we introduce a modal\nabstraction to describe the truth of assertions relative to a specific virtual\naddress space: [r]P indicating that P holds in the virtual address space rooted\nat r. Such modal assertions allow different address spaces to refer to each\nother, enabling complete verification of instruction sequences manipulating\nmultiple address spaces. Using them effectively requires working with other\nassertions, such as points-to assertions in our separation logic, as relative\nto a given address space. We therefore define virtual points-to relations,\nwhich mimic hardware address translation, relative to a page table root. 
We\ndemonstrate our approach with challenging fragments of VMM code showing that\nour approach handles examples beyond what prior work can address, including\nreasoning about a sequence of instructions as it changes address spaces. All\ndefinitions and theorems mentioned in this paper including the operational\nmodel of a RISC-like fragment of x86-64, a simple language run on this\noperational model, and a logic as an instantiation of the Iris framework are\nmechanized inside Coq.\n","authors":["Ismail Kuru","Colin S. Gordon"],"pdf_url":"https://arxiv.org/pdf/2307.14471v3.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2409.09874v1","updated":"2024-09-15T21:50:20Z","published":"2024-09-15T21:50:20Z","title":"The Landscape of GPU-Centric Communication","summary":" n recent years, GPUs have become the preferred accelerators for HPC and ML\napplications due to their parallelism and fast memory bandwidth. While GPUs\nboost computation, inter-GPU communication can create scalability bottlenecks,\nespecially as the number of GPUs per node and cluster grows. Traditionally, the\nCPU managed multi-GPU communication, but advancements in GPU-centric\ncommunication now challenge this CPU dominance by reducing its involvement,\ngranting GPUs more autonomy in communication tasks, and addressing mismatches\nin multi-GPU communication and computation.\n This paper provides a landscape of GPU-centric communication, focusing on\nvendor mechanisms and user-level library supports. It aims to clarify the\ncomplexities and diverse options in this field, define the terminology, and\ncategorize existing approaches within and across nodes. The paper discusses\nvendor-provided mechanisms for communication and memory management in multi-GPU\nexecution and reviews major communication libraries, their benefits,\nchallenges, and performance insights. Then, it explores key research paradigms,\nfuture outlooks, and open research questions. 
By extensively describing\nGPU-centric communication techniques across the software and hardware stacks,\nwe provide researchers, programmers, engineers, and library designers insights\non how to exploit multi-GPU systems at their best.\n","authors":["Didem Unat","Ilyas Turimbetov","Mohammed Kefah Taha Issa","Doğan Sağbili","Flavio Vella","Daniele De Sensi","Ismayil Ismayilov"],"pdf_url":"https://arxiv.org/pdf/2409.09874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09846v1","updated":"2024-09-15T19:53:30Z","published":"2024-09-15T19:53:30Z","title":"A Global Perspective on the Past, Present, and Future of Video Streaming\n over Starlink","summary":" This study presents the first global analysis of on-demand video streaming\nover Low Earth Orbit (LEO) satellite networks, using data from over one million\nhouseholds across 85 countries. We highlight Starlink's role as a major LEO\nprovider, enhancing connectivity in underserved regions. Our findings reveal\nthat while overall video quality on Starlink matches that of traditional\nnetworks, the inherent variability in LEO conditions -- such as throughput\nfluctuations and packet loss -- leads to an increase in bitrate switches and\nrebuffers. To further improve the quality of experience for the LEO community,\nwe manipulate existing congestion control and adaptive bitrate streaming\nalgorithms using simulation and real A/B tests deployed on over one million\nhouseholds. 
Our results underscore the need for video streaming and congestion\ncontrol algorithms to adapt to rapidly evolving network landscapes, ensuring\nhigh-quality service across diverse and dynamic network types.\n","authors":["Liz Izhikevich","Reese Enghardt","Te-Yuan Huang","Renata Teixeira"],"pdf_url":"https://arxiv.org/pdf/2409.09846v1.pdf","comment":null}],"Operation Systems":[{"id":"http://arxiv.org/abs/2409.09606v1","updated":"2024-09-15T04:11:26Z","published":"2024-09-15T04:11:26Z","title":"BULKHEAD: Secure, Scalable, and Efficient Kernel Compartmentalization\n with PKS","summary":" The endless stream of vulnerabilities urgently calls for principled\nmitigation to confine the effect of exploitation. However, the monolithic\narchitecture of commodity OS kernels, like the Linux kernel, allows an attacker\nto compromise the entire system by exploiting a vulnerability in any kernel\ncomponent. Kernel compartmentalization is a promising approach that follows the\nleast-privilege principle. However, existing mechanisms struggle with the\ntrade-off on security, scalability, and performance, given the challenges\nstemming from mutual untrustworthiness among numerous and complex components.\n In this paper, we present BULKHEAD, a secure, scalable, and efficient kernel\ncompartmentalization technique that offers bi-directional isolation for\nunlimited compartments. It leverages Intel's new hardware feature PKS to\nisolate data and code into mutually untrusted compartments and benefits from\nits fast compartment switching. With untrust in mind, BULKHEAD introduces a\nlightweight in-kernel monitor that enforces multiple important security\ninvariants, including data integrity, execute-only memory, and compartment\ninterface integrity. In addition, it provides a locality-aware two-level scheme\nthat scales to unlimited compartments. We implement a prototype system on Linux\nv6.1 to compartmentalize loadable kernel modules (LKMs). 
Extensive evaluation\nconfirms the effectiveness of our approach. As the system-wide impacts,\nBULKHEAD incurs an average performance overhead of 2.44% for real-world\napplications with 160 compartmentalized LKMs. While focusing on a specific\ncompartment, ApacheBench tests on ipv6 show an overhead of less than 2%.\nMoreover, the performance is almost unaffected by the number of compartments,\nwhich makes it highly scalable.\n","authors":["Yinggang Guo","Zicheng Wang","Weiheng Bai","Qingkai Zeng","Kangjie Lu"],"pdf_url":"https://arxiv.org/pdf/2409.09606v1.pdf","comment":"Accepted to appear in NDSS'25"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2409.09889v1","updated":"2024-09-15T22:53:56Z","published":"2024-09-15T22:53:56Z","title":"Well-Behaved (Co)algebraic Semantics of Regular Expressions in Dafny","summary":" Regular expressions are commonly understood in terms of their denotational\nsemantics, that is, through formal languages -- the regular languages. This\nview is inductive in nature: two primitives are equivalent if they are\nconstructed in the same way. Alternatively, regular expressions can be\nunderstood in terms of their operational semantics, that is, through\ndeterministic finite automata. This view is coinductive in nature: two\nprimitives are equivalent if they are deconstructed in the same way. It is\nimplied by Kleene's famous theorem that both views are equivalent: regular\nlanguages are precisely the formal languages accepted by deterministic finite\nautomata. In this paper, we use Dafny, a verification-aware programming\nlanguage, to formally verify, for the first time, what has been previously\nestablished only through proofs-by-hand: the two semantics of regular\nexpressions are well-behaved, in the sense that they are in fact one and the\nsame, up to pointwise bisimilarity. At each step of our formalisation, we\npropose an interpretation in the language of Coalgebra. 
We found that Dafny is\nparticularly well suited for the task due to its inductive and coinductive\nfeatures and hope our approach serves as a blueprint for future generalisations\nto other theories.\n","authors":["Stefan Zetzsche","Wojciech Rozowski"],"pdf_url":"https://arxiv.org/pdf/2409.09889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09769v1","updated":"2024-09-15T15:37:38Z","published":"2024-09-15T15:37:38Z","title":"Risk-Aware Autonomous Driving for Linear Temporal Logic Specifications","summary":" Decision-making for autonomous driving incorporating different types of risks\nis a challenging topic. This paper proposes a novel risk metric to facilitate\nthe driving task specified by linear temporal logic (LTL) by balancing the risk\nbrought up by different uncertain events. Such a balance is achieved by\ndiscounting the costs of these uncertain events according to their timing and\nseverity, thereby reflecting a human-like awareness of risk. We have\nestablished a connection between this risk metric and the occupation measure, a\nfundamental concept in stochastic reachability problems, such that a risk-aware\ncontrol synthesis problem under LTL specifications is formulated for autonomous\nvehicles using occupation measures. As a result, the synthesized policy\nachieves balanced decisions across different types of risks with associated\ncosts, showcasing advantageous versatility and generalizability. 
The\neffectiveness and scalability of the proposed approach are validated by three\ntypical traffic scenarios in Carla simulator.\n","authors":["Shuhao Qi","Zengjie Zhang","Zhiyong Sun","Sofie Haesaert"],"pdf_url":"https://arxiv.org/pdf/2409.09769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.10995v3","updated":"2024-09-15T15:28:05Z","published":"2022-05-23T01:56:52Z","title":"From Width-Based Model Checking to Width-Based Automated Theorem Proving","summary":" In the field of parameterized complexity theory, the study of graph width\nmeasures has been intimately connected with the development of width-based\nmodel checking algorithms for combinatorial properties on graphs. In this work,\nwe introduce a general framework to convert a large class of width-based\nmodel-checking algorithms into algorithms that can be used to test the validity\nof graph-theoretic conjectures on classes of graphs of bounded width. Our\nframework is modular and can be applied with respect to several well-studied\nwidth measures for graphs, including treewidth and cliquewidth.\n As a quantitative application of our framework, we prove analytically that\nfor several long-standing graph-theoretic conjectures, there exists an\nalgorithm that takes a number $k$ as input and correctly determines in time\ndouble-exponential in $k^{O(1)}$ whether the conjecture is valid on all graphs\nof treewidth at most $k$. 
These upper bounds, which may be regarded as\nupper-bounds on the size of proofs/disproofs for these conjectures on the class\nof graphs of treewidth at most $k$, improve significantly on theoretical upper\nbounds obtained using previously available techniques.\n","authors":["Mateus de Oliveira Oliveira","Farhad Vadiee"],"pdf_url":"https://arxiv.org/pdf/2205.10995v3.pdf","comment":"A preliminary version of this work was published in the proceedings\n of AAAI 2023"}]},"2024-09-17T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2306.13002v4","updated":"2024-09-17T07:29:35Z","published":"2023-06-22T15:59:04Z","title":"ACC Saturator: Automatic Kernel Optimization for Directive-Based GPU\n Code","summary":" Automatic code optimization is a complex process that typically involves the\napplication of multiple discrete algorithms that modify the program structure\nirreversibly. However, the design of these algorithms is often monolithic, and\nthey require repetitive implementation to perform similar analyses due to the\nlack of cooperation. To address this issue, modern optimization techniques,\nsuch as equality saturation, allow for exhaustive term rewriting at various\nlevels of inputs, thereby simplifying compiler design.\n In this paper, we propose equality saturation to optimize sequential codes\nutilized in directive-based programming for GPUs. Our approach realizes less\ncomputation, less memory access, and high memory throughput simultaneously. Our\nfully-automated framework constructs single-assignment forms from inputs to be\nentirely rewritten while keeping dependencies and extracts optimal cases.\nThrough practical benchmarks, we demonstrate a significant performance\nimprovement on several compilers. Furthermore, we highlight the advantages of\ncomputational reordering and emphasize the significance of memory-access order\nfor modern GPUs.\n","authors":["Kazuaki Matsumura","Simon Garcia De Gonzalo","Antonio J. 
Peña"],"pdf_url":"https://arxiv.org/pdf/2306.13002v4.pdf","comment":"To appear in: Proceedings of Eleventh Workshop on Accelerator\n Programming and Directives (WACCPD 2024)"},{"id":"http://arxiv.org/abs/2409.11392v1","updated":"2024-09-17T17:50:52Z","published":"2024-09-17T17:50:52Z","title":"Temporal Load Imbalance on Ondes3D Seismic Simulator for Different\n Multicore Architectures","summary":" The variety of today's multicore architectures motivates researchers to\nexplore parallel scientific applications on different platforms. Load imbalance\nis one performance issue that can prejudice parallel applications from\nexploiting the computational power of these platforms. Ondes3D is a scientific\napplication for seismic wave simulation used to assess the geological impact of\nearthquakes. Its parallelism relies on applying a regular domain decomposition\nin the geological domain provided and distributing each sub-domain to MPI\nranks. Previous works investigate the significant spatial and temporal\nimbalance in Ondes3D and suggest new parallelization and load balancing\ntechniques to minimize them. However, none explored its execution on different\narchitectures. Our paper evaluates the performance of Ondes3D for two\nearthquake scenarios on eight different multicore architectures, including\nIntel, AMD, and ARM processors. We measure the load distribution per MPI rank,\nevaluate the temporal load imbalance, and compare the execution of the\napplication's kernels. 
Our results show that the temporal load imbalance in\nOndes3D depends on the architecture chosen, with some platforms minimizing such\nimbalance more effectively.\n","authors":["Ana Luisa Veroneze Solórzano","Philippe Olivier Alexandre Navaux","Lucas Mello Schnorr"],"pdf_url":"https://arxiv.org/pdf/2409.11392v1.pdf","comment":"The 2020 International Conference on High Performance Computing and\n Simulation (HPCS 2020)"},{"id":"http://arxiv.org/abs/2409.11304v1","updated":"2024-09-17T15:59:47Z","published":"2024-09-17T15:59:47Z","title":"Communication Lower Bounds and Optimal Algorithms for Symmetric Matrix\n Computations","summary":" In this article, we focus on the communication costs of three symmetric\nmatrix computations: i) multiplying a matrix with its transpose, known as a\nsymmetric rank-k update (SYRK) ii) adding the result of the multiplication of a\nmatrix with the transpose of another matrix and the transpose of that result,\nknown as a symmetric rank-2k update (SYR2K) iii) performing matrix\nmultiplication with a symmetric input matrix (SYMM). All three computations\nappear in the Level 3 Basic Linear Algebra Subroutines (BLAS) and have wide use\nin applications involving symmetric matrices. We establish communication lower\nbounds for these kernels using sequential and distributed-memory parallel\ncomputational models, and we show that our bounds are tight by presenting\ncommunication-optimal algorithms for each setting. Our lower bound proofs rely\non applying a geometric inequality for symmetric computations and analytically\nsolving constrained nonlinear optimization problems. The symmetric matrix and\nits corresponding computations are accessed and performed according to a\ntriangular block partitioning scheme in the optimal algorithms.\n","authors":["Hussam Al Daas","Grey Ballard","Laura Grigori","Suraj Kumar","Kathryn Rouse","Mathieu Verite"],"pdf_url":"https://arxiv.org/pdf/2409.11304v1.pdf","comment":"43 pages, 6 figures. 
To be published in ACM Transactions on Parallel\n Computing"},{"id":"http://arxiv.org/abs/2311.08776v2","updated":"2024-09-17T15:50:27Z","published":"2023-11-15T08:44:44Z","title":"Context Adaptive Cooperation","summary":" As shown by Reliable Broadcast and Consensus, cooperation among a set of\nindependent computing entities (sequential processes) is a central issue in\ndistributed computing. Considering $n$-process asynchronous message-passing\nsystems where some processes can be Byzantine, this paper introduces a new\ncooperation abstraction denoted Context-Adaptive Cooperation (CAC). While\nReliable Broadcast is a one-to-$n$ cooperation abstraction and Consensus is an\n$n$-to-$n$ cooperation abstraction, CAC is a $d$-to-$n$ cooperation abstraction\nwhere the parameter $d$ ($1\\leq d\\leq n$) depends on the run and remains\nunknown to the processes. Moreover, the correct processes accept the same set\nof $\\ell$ pairs $\\langle v,i\\rangle$ ($v$ is the value proposed by $p_i$) from\nthe $d$ proposer processes, where $1 \\leq \\ell \\leq d$ and, as $d$, $\\ell$\nremains unknown to the processes (except in specific cases). Those $\\ell$\nvalues are accepted one at a time in different orders at each process.\nFurthermore, CAC provides the processes with an imperfect oracle that gives\ninformation about the values that they may accept in the future. In a very\ninteresting way, the CAC abstraction is particularly efficient in favorable\ncircumstances. 
To illustrate its practical use, the paper describes in detail\ntwo applications that benefit from the abstraction: a fast consensus\nimplementation under low contention (named Cascading Consensus), and a novel\nnaming problem.\n","authors":["Timothé Albouy","Davide Frey","Mathieu Gestin","Michel Raynal","François Taïani"],"pdf_url":"https://arxiv.org/pdf/2311.08776v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11240v1","updated":"2024-09-17T14:42:49Z","published":"2024-09-17T14:42:49Z","title":"Federated Learning with Integrated Sensing, Communication, and\n Computation: Frameworks and Performance Analysis","summary":" With the emergence of integrated sensing, communication, and computation\n(ISCC) in the upcoming 6G era, federated learning with ISCC (FL-ISCC),\nintegrating sample collection, local training, and parameter exchange and\naggregation, has garnered increasing interest for enhancing training\nefficiency. Currently, FL-ISCC primarily includes two algorithms: FedAVG-ISCC\nand FedSGD-ISCC. However, the theoretical understanding of the performance and\nadvantages of these algorithms remains limited. To address this gap, we\ninvestigate a general FL-ISCC framework, implementing both FedAVG-ISCC and\nFedSGD-ISCC. We experimentally demonstrate the substantial potential of the\nISCC framework in reducing latency and energy consumption in FL. Furthermore,\nwe provide a theoretical analysis and comparison. The results reveal that:1)\nBoth sample collection and communication errors negatively impact algorithm\nperformance, highlighting the need for careful design to optimize FL-ISCC\napplications. 2) FedAVG-ISCC performs better than FedSGD-ISCC under IID data\ndue to its advantage with multiple local updates. 3) FedSGD-ISCC is more robust\nthan FedAVG-ISCC under non-IID data, where the multiple local updates in\nFedAVG-ISCC worsen performance as non-IID data increases. FedSGD-ISCC maintains\nperformance levels similar to IID conditions. 
4) FedSGD-ISCC is more resilient\nto communication errors than FedAVG-ISCC, which suffers from significant\nperformance degradation as communication errors increase.Extensive simulations\nconfirm the effectiveness of the FL-ISCC framework and validate our theoretical\nanalysis.\n","authors":["Yipeng Liang","Qimei Chen","Hao Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.11240v1.pdf","comment":"due to the limitation The abstract field cannot be longer than 1,920\n characters\", the abstract appearing here is slightly shorter than that in the\n PDF file"},{"id":"http://arxiv.org/abs/2409.11208v1","updated":"2024-09-17T14:02:58Z","published":"2024-09-17T14:02:58Z","title":"Energy Efficiency Support for Software Defined Networks: a Serverless\n Computing Approach","summary":" Automatic network management strategies have become paramount for meeting the\nneeds of innovative real-time and data-intensive applications, such as in the\nInternet of Things. However, meeting the ever-growing and fluctuating demands\nfor data and services in such applications requires more than ever an efficient\nand scalable network resource management approach. Such approach should enable\nthe automated provisioning of services while incentivising energy-efficient\nresource usage that expands throughout the edge-to-cloud continuum. This paper\nis the first to realise the concept of modular Software-Defined Networks based\non serverless functions in an energy-aware environment. By adopting Function as\na Service, the approach enables on-demand deployment of network functions,\nresulting in cost reduction through fine resource provisioning granularity. An\nanalytical model is presented to approximate the service delivery time and\npower consumption, as well as an open-source prototype implementation supported\nby an extensive experimental evaluation. 
The experiments demonstrate not only\nthe practical applicability of the proposed approach but significant\nimprovement in terms of energy efficiency.\n","authors":["Fatemeh Banaie","Karim Djemame","Abdulaziz Alhindi","Vasilios Kelefouras"],"pdf_url":"https://arxiv.org/pdf/2409.11208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11068v1","updated":"2024-09-17T10:49:45Z","published":"2024-09-17T10:49:45Z","title":"A Reinforcement Learning Environment for Automatic Code Optimization in\n the MLIR Compiler","summary":" Code optimization is a crucial task aimed at enhancing code performance.\nHowever, this process is often tedious and complex, highlighting the necessity\nfor automatic code optimization techniques. Reinforcement Learning (RL), a\nmachine learning technique, has emerged as a promising approach for tackling\nsuch complex optimization problems. In this project, we introduce the first RL\nenvironment for the MLIR compiler, dedicated to facilitating MLIR compiler\nresearch, and enabling automatic code optimization using Multi-Action\nReinforcement Learning. We also propose a novel formulation of the action space\nas a Cartesian product of simpler action subspaces, enabling more efficient and\neffective optimizations. 
Experimental results demonstrate that our proposed\nenvironment allows for an effective optimization of MLIR operations, and yields\ncomparable performance to TensorFlow, surpassing it in multiple cases,\nhighlighting the potential of RL-based optimization in compiler frameworks.\n","authors":["Nazim Bendib","Iheb Nassim Aouadj","Riyadh Baghdadi"],"pdf_url":"https://arxiv.org/pdf/2409.11068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11043v1","updated":"2024-09-17T10:06:19Z","published":"2024-09-17T10:06:19Z","title":"Delay Analysis of EIP-4844","summary":" Proto-Danksharding, proposed in Ethereum Improvement Proposal 4844\n(EIP-4844), aims to incrementally improve the scalability of the Ethereum\nblockchain by introducing a new type of transaction known as blob-carrying\ntransactions. These transactions incorporate binary large objects (blobs) of\ndata that are stored off-chain but referenced and verified on-chain to ensure\ndata availability. By decoupling data availability from transaction execution,\nProto-Danksharding alleviates network congestion and reduces gas fees, laying\nthe groundwork for future, more advanced sharding solutions. This letter\nprovides an analytical model to derive the delay for these new transactions. We\nmodel the system as an $\\mathrm{M/D}^B/1$ queue which we then find its steady\nstate distribution through embedding a Markov chain and use of supplementary\nvariable method. 
We show that transactions with more blobs but less frequent\nimpose higher delays on the system compared to lower blobs but more frequent.\n","authors":["Pourya Soltani","Farid Ashtiani"],"pdf_url":"https://arxiv.org/pdf/2409.11043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10954v1","updated":"2024-09-17T07:44:04Z","published":"2024-09-17T07:44:04Z","title":"Ladon: High-Performance Multi-BFT Consensus via Dynamic Global Ordering\n (Extended Version)","summary":" Multi-BFT consensus runs multiple leader-based consensus instances in\nparallel, circumventing the leader bottleneck of a single instance. However, it\ncontains an Achilles' heel: the need to globally order output blocks across\ninstances. Deriving this global ordering is challenging because it must cope\nwith different rates at which blocks are produced by instances. Prior Multi-BFT\ndesigns assign each block a global index before creation, leading to poor\nperformance.\n We propose Ladon, a high-performance Multi-BFT protocol that allows varying\ninstance block rates. Our key idea is to order blocks across instances\ndynamically, which eliminates blocking on slow instances. We achieve dynamic\nglobal ordering by assigning monotonic ranks to blocks. We pipeline rank\ncoordination with the consensus process to reduce protocol overhead and combine\naggregate signatures with rank information to reduce message complexity.\nLadon's dynamic ordering enables blocks to be globally ordered according to\ntheir generation, which respects inter-block causality. We implemented and\nevaluated Ladon by integrating it with both PBFT and HotStuff protocols. 
Our\nevaluation shows that Ladon-PBFT (resp., Ladon-HotStuff) improves the peak\nthroughput of the prior art by $\\approx$8x (resp., 2x) and reduces latency by\n$\\approx$62% (resp., 23%), when deployed with one straggling replica (out of\n128 replicas) in a WAN setting.\n","authors":["Hanzheng Lyu","Shaokang Xie","Jianyu Niu","Chen Feng","Yinqian Zhang","Ivan Beschastnikh"],"pdf_url":"https://arxiv.org/pdf/2409.10954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10946v1","updated":"2024-09-17T07:28:56Z","published":"2024-09-17T07:28:56Z","title":"Skip TLB flushes for reused pages within mmap's","summary":" Memory access efficiency is significantly enhanced by caching recent address\ntranslations in the CPUs' Translation Lookaside Buffers (TLBs). However, since\nthe operating system is not aware of which core is using a particular mapping,\nit flushes TLB entries across all cores where the application runs whenever\naddresses are unmapped, ensuring security and consistency. These TLB flushes,\nknown as TLB shootdowns, are costly and create a performance and scalability\nbottleneck. A key contributor to TLB shootdowns is memory-mapped I/O,\nparticularly during mmap-munmap cycles and page cache evictions. Often, the\nsame physical pages are reassigned to the same process post-eviction,\npresenting an opportunity for the operating system to reduce the frequency of\nTLB shootdowns. We demonstrate, that by slightly extending the mmap function,\nTLB shootdowns for these \"recycled pages\" can be avoided.\n Therefore we introduce and implement the \"fast page recycling\" (FPR) feature\nwithin the mmap system call. FPR-mmaps maintain security by only triggering TLB\nshootdowns when a page exits its recycling cycle and is allocated to a\ndifferent process. 
To ensure consistency when FPR-mmap pointers are used, we\nmade minor adjustments to virtual memory management to avoid the ABA problem.\nUnlike previous methods to mitigate shootdown effects, our approach does not\nrequire any hardware modifications and operates transparently within the\nexisting Linux virtual memory framework.\n Our evaluations across a variety of CPU, memory, and storage setups,\nincluding persistent memory and Optane SSDs, demonstrate that FPR delivers\nnotable performance gains, with improvements of up to 28% in real-world\napplications and 92% in micro-benchmarks. Additionally, we show that TLB\nshootdowns are a significant source of bottlenecks, previously misattributed to\nother components of the Linux kernel.\n","authors":["Frederic Schimmelpfennig","André Brinkmann","Hossein Asadi","Reza Salkhordeh"],"pdf_url":"https://arxiv.org/pdf/2409.10946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10839v1","updated":"2024-09-17T02:08:52Z","published":"2024-09-17T02:08:52Z","title":"Dynamic DAG-Application Scheduling for Multi-Tier Edge Computing in\n Heterogeneous Networks","summary":" Edge computing is deemed a promising technique to execute latency-sensitive\napplications by offloading computation-intensive tasks to edge servers.\nExtensive research has been conducted in the field of end-device to edge server\ntask offloading for several goals, including latency minimization, energy\noptimization, and resource optimization. However, few of them consider our\nmobile computing devices (smartphones, tablets, and laptops) to be edge\ndevices. In this paper, we propose a novel multi-tier edge computing framework,\nwhich we refer to as M-TEC, that aims to optimize latency, reduce the\nprobability of failure, and optimize cost while accounting for the sporadic\nfailure of personally owned devices and the changing network conditions. 
We\nconduct experiments with a real testbed and a real commercial CBRS 4G network,\nand the results indicate that M-TEC is capable of reducing the end-to-end\nlatency of applications by at least 8\\% compared to the best baseline under a\nvariety of network conditions, while providing reliable performance at an\naffordable cost.\n","authors":["Xiang Li","Mustafa Abdallah","Yuan-Yao Lou","Mung Chiang","Kwang Taik Kim","Saurabh Bagchi"],"pdf_url":"https://arxiv.org/pdf/2409.10839v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2409.11592v1","updated":"2024-09-17T22:52:44Z","published":"2024-09-17T22:52:44Z","title":"CountChain: A Decentralized Oracle Network for Counting Systems","summary":" Blockchain integration in industries like online advertising is hindered by\nits connectivity limitations to off-chain data. These industries heavily rely\non precise counting systems for collecting and analyzing off-chain data. This\nrequires mechanisms, often called oracles, to feed off-chain data into smart\ncontracts. However, current oracle solutions are ill-suited for counting\nsystems since the oracles do not know when to expect the data, posing a\nsignificant challenge.\n To address this, we present CountChain, a decentralized oracle network for\ncounting systems. In CountChain, data is received by all oracle nodes, and any\nnode can submit a proposition request. Each proposition contains enough data to\nevaluate the occurrence of an event. Only randomly selected nodes participate\nin a game to evaluate the truthfulness of each proposition by providing proof\nand some stake. Finally, the propositions with the outcome of True increment\nthe counter in a smart contract. Thus, instead of a contract calling oracles\nfor data, in CountChain, the oracles call a smart contract when the data is\navailable. 
Furthermore, we present a formal analysis and experimental\nevaluation of the system's parameters on over half a million data points to\nobtain optimal system parameters. In such conditions, our game-theoretical\nanalysis demonstrates that a Nash equilibrium exists wherein all rational\nparties participate with honesty.\n","authors":["Behkish Nassirzadeh","Stefanos Leonardos","Albert Heinle","Anwar Hasan","Vijay Ganesh"],"pdf_url":"https://arxiv.org/pdf/2409.11592v1.pdf","comment":"being published at https://ieee-cybermatics.org/2024/blockchain/"},{"id":"http://arxiv.org/abs/2409.11585v1","updated":"2024-09-17T22:20:26Z","published":"2024-09-17T22:20:26Z","title":"Advances in APPFL: A Comprehensive and Extensible Federated Learning\n Framework","summary":" Federated learning (FL) is a distributed machine learning paradigm enabling\ncollaborative model training while preserving data privacy. In today's\nlandscape, where most data is proprietary, confidential, and distributed, FL\nhas become a promising approach to leverage such data effectively, particularly\nin sensitive domains such as medicine and the electric grid. Heterogeneity and\nsecurity are the key challenges in FL, however; most existing FL frameworks\neither fail to address these challenges adequately or lack the flexibility to\nincorporate new solutions. To this end, we present the recent advances in\ndeveloping APPFL, an extensible framework and benchmarking suite for federated\nlearning, which offers comprehensive solutions for heterogeneity and security\nconcerns, as well as user-friendly interfaces for integrating new algorithms or\nadapting to new applications. We demonstrate the capabilities of APPFL through\nextensive experiments evaluating various aspects of FL, including communication\nefficiency, privacy preservation, computational performance, and resource\nutilization. We further highlight the extensibility of APPFL through case\nstudies in vertical, hierarchical, and decentralized FL. 
APPFL is open-sourced\nat https://github.com/APPFL/APPFL.\n","authors":["Zilinghan Li","Shilan He","Ze Yang","Minseok Ryu","Kibaek Kim","Ravi Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.11585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17951v2","updated":"2024-09-17T19:14:33Z","published":"2024-06-25T21:57:26Z","title":"Navigating High-Degree Heterogeneity: Federated Learning in Aerial and\n Space Networks","summary":" Federated learning offers a compelling solution to the challenges of\nnetworking and data privacy within aerial and space networks by utilizing vast\nprivate edge data and computing capabilities accessible through drones,\nballoons, and satellites. While current research has focused on optimizing the\nlearning process, computing efficiency, and minimizing communication overhead,\nthe heterogeneity issue and class imbalance remain a significant barrier to\nrapid model convergence. In this paper, we explore the influence of\nheterogeneity on class imbalance, which diminishes performance in Aerial and\nSpace Networks (ASNs)-based federated learning. We illustrate the correlation\nbetween heterogeneity and class imbalance within grouped data and show how\nconstraints such as battery life exacerbate the class imbalance challenge. Our\nfindings indicate that ASNs-based FL faces heightened class imbalance issues\neven with similar levels of heterogeneity compared to other scenarios. Finally,\nwe analyze the impact of varying degrees of heterogeneity on FL training and\nevaluate the efficacy of current state-of-the-art algorithms under these\nconditions. 
Our results reveal that the heterogeneity challenge is more\npronounced in ASNs-based federated learning and that prevailing algorithms\noften fail to effectively address high levels of heterogeneity.\n","authors":["Fan Dong","Henry Leung","Steve Drew"],"pdf_url":"https://arxiv.org/pdf/2406.17951v2.pdf","comment":"Accepted by IEEE 10th World Forum on Internet of Things"}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2409.01804v2","updated":"2024-09-17T16:31:26Z","published":"2024-09-03T11:37:30Z","title":"Strengthening Solidity Invariant Generation: From Post- to\n Pre-Deployment","summary":" Invariants are essential for ensuring the security and correctness of\nSolidity smart contracts, particularly in the context of blockchain's\nimmutability and decentralized execution. This paper introduces InvSol, a novel\nframework for pre-deployment invariant generation tailored specifically for\nSolidity smart contracts. Unlike existing solutions, namely InvCon, InvCon+,\nand Trace2Inv, that rely on post-deployment transaction histories on Ethereum\nmainnet, InvSol identifies invariants before deployment and offers\ncomprehensive coverage of Solidity language constructs, including loops.\nAdditionally, InvSol incorporates custom templates to effectively prevent\ncritical issues such as reentrancy, out-of-gas errors, and exceptions during\ninvariant generation. We rigorously evaluate InvSol using a benchmark set of\nsmart contracts and compare its performance with state-of-the-art solutions.\nOur findings reveal that InvSol significantly outperforms these tools,\ndemonstrating its effectiveness in handling new contracts with limited\ntransaction histories. 
Notably, InvSol achieves a 15% improvement in\nidentifying common vulnerabilities compared to InvCon+ and is able to address\ncertain crucial vulnerabilities using specific invariant templates, better than\nTrace2Inv.\n","authors":["Kartik Kaushik","Raju Halder","Samrat Mondal"],"pdf_url":"https://arxiv.org/pdf/2409.01804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.15453v4","updated":"2024-09-17T13:01:21Z","published":"2021-03-29T09:37:30Z","title":"Disentangling Parallelism and Interference in Game Semantics","summary":" Game semantics is a denotational semantics presenting compositionally the\ncomputational behaviour of various kinds of effectful programs. One of its\ncelebrated achievement is to have obtained full abstraction results for\nprogramming languages with a variety of computational effects, in a single\nframework. This is known as the semantic cube or Abramsky's cube, which for\nsequential deterministic programs establishes a correspondence between certain\nconditions on strategies (''innocence'', ''well-bracketing'', ''visibility'')\nand the absence of matching computational effects. Outside of the sequential\ndeterministic realm, there are still a wealth of game semantics-based full\nabstraction results; but they no longer fit in a unified canvas. In particular,\nGhica and Murawski's fully abstract model for shared state concurrency (IA)\ndoes not have a matching notion of pure parallel program-we say that\nparallelism and interference (i.e. state plus semaphores) are entangled. In\nthis paper we construct a causal version of Ghica and Murawski's model, also\nfully abstract for IA. We provide compositional conditions parallel innocence\nand sequentiality, respectively banning interference and parallelism, and\nleading to four full abstraction results. 
To our knowledge, this is the first\nextension of Abramsky's semantic cube programme beyond the sequential\ndeterministic world.\n","authors":["Simon Castellan","Pierre Clairambault"],"pdf_url":"https://arxiv.org/pdf/2103.15453v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16075v2","updated":"2024-09-17T16:52:15Z","published":"2024-04-24T01:33:07Z","title":"Validating Traces of Distributed Programs Against TLA+ Specifications","summary":" TLA+ is a formal language for specifying systems, including distributed\nalgorithms, that is supported by powerful verification tools. In this work we\npresent a framework for relating traces of distributed programs to high-level\nspecifications written in TLA+. The problem is reduced to a constrained model\nchecking problem, realized using the TLC model checker. Our framework consists\nof an API for instrumenting Java programs in order to record traces of\nexecutions, of a collection of TLA+ operators that are used for relating those\ntraces to specifications, and of scripts for running the model checker.\nCrucially, traces only contain updates to specification variables rather than\nfull values, and developers may choose to trace only certain variables. We have\napplied our approach to several distributed programs, detecting discrepancies\nbetween the specifications and the implementations in all cases. We discuss\nreasons for these discrepancies, best practices for instrumenting programs, and\nhow to interpret the verdict produced by TLC.\n","authors":["Horatiu Cirstea","Markus A. 
Kuppe","Benjamin Loillier","Stephan Merz"],"pdf_url":"https://arxiv.org/pdf/2404.16075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13322v2","updated":"2024-09-17T16:29:03Z","published":"2023-12-20T15:11:06Z","title":"MonoCoder: Domain-Specific Code Language Model for HPC Codes and Tasks","summary":" With easier access to powerful compute resources, there is a growing trend in\nAI for software development to develop large language models (LLMs) to address\na variety of programming tasks. Even LLMs applied to tasks from the\nhigh-performance computing (HPC) domain are huge in size and demand expensive\ncompute resources for training. This is partly because LLMs for HPC tasks are\nobtained by finetuning existing LLMs that support several natural and/or\nprogramming languages. We found this design choice confusing - why do we need\nLLMs trained on natural languages and programming languages unrelated to HPC\nfor HPC-specific tasks? In this line of work, we aim to question choices made\nby existing LLMs by developing smaller language models (LMs) for specific\ndomains - we call them domain-specific LMs. Specifically, we start with HPC as\na domain and build an HPC-specific LM, named MonoCoder, which is orders of\nmagnitude smaller than existing LMs but delivers better performance on non-HPC\nand HPC codes. Specifically, we pre-trained MonoCoder on an HPC-specific\ndataset (named HPCorpus) of C and C++ programs mined from GitHub. We evaluated\nthe performance of MonoCoder against state-of-the-art multi-lingual LLMs.\nResults demonstrate that MonoCoder, although much smaller than existing LMs,\noutperforms other LLMs on normalized-perplexity tests (in relation to model\nsize) while also delivering competing CodeBLEU scores for high-performance and\nparallel code generations. In other words, results suggest that MonoCoder\nunderstands HPC code better than state-of-the-art LLMs.\n","authors":["Tal Kadosh","Niranjan Hasabnis","Vy A. 
Vo","Nadav Schneider","Neva Krien","Mihai Capota","Abdul Wasay","Nesreen Ahmed","Ted Willke","Guy Tamir","Yuval Pinter","Timothy Mattson","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2312.13322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11157v1","updated":"2024-09-17T13:10:38Z","published":"2024-09-17T13:10:38Z","title":"The Incredible Shrinking Context... in a decompiler near you","summary":" Decompilation of binary code has arisen as a highly-important application in\nthe space of Ethereum VM (EVM) smart contracts. Major new decompilers appear\nnearly every year and attain popularity, for a multitude of reverse-engineering\nor tool-building purposes. Technically, the problem is fundamental: it consists\nof recovering high-level control flow from a highly-optimized\ncontinuation-passing-style (CPS) representation. Architecturally, decompilers\ncan be built using either static analysis or symbolic execution techniques.\n We present Shrknr, a static-analysis-based decompiler succeeding the\nstate-of-the-art Elipmoc decompiler. Shrknr manages to achieve drastic\nimprovements relative to the state of the art, in all significant dimensions:\nscalability, completeness, precision. Chief among the techniques employed is a\nnew variant of static analysis context: shrinking context sensitivity.\nShrinking context sensitivity performs deep cuts in the static analysis\ncontext, eagerly \"forgetting\" control-flow history, in order to leave room for\nfurther precise reasoning.\n We compare Shrnkr to state-of-the-art decompilers, both static-analysis- and\nsymbolic-execution-based. 
In a standard benchmark set, Shrnkr scales to over\n99.5% of contracts (compared to ~95%), covers (i.e., reaches and manages to\ndecompile) 67% more code, and reduces key imprecision metrics by over 65%.\n","authors":["Sifis Lagouvardos","Yannis Bollanos","Neville Grech","Yannis Smaragdakis"],"pdf_url":"https://arxiv.org/pdf/2409.11157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11133v1","updated":"2024-09-17T12:32:38Z","published":"2024-09-17T12:32:38Z","title":"Towards Quantum Multiparty Session Types","summary":" Multiparty Session Types (MPSTs) offer a structured way of specifying\ncommunication protocols and guarantee relevant communication properties, such\nas deadlock-freedom. In this paper, we extend a minimal MPST system with\nquantum data and operations, enabling the specification of quantum protocols.\nQuantum MPSTs (QMPSTs) provide a formal notation to describe quantum protocols,\nboth at the abstract level of global types, describing which communications can\ntake place in the system and their dependencies, and at the concrete level of\nlocal types and quantum processes, describing the expected behavior of each\nparticipant in the protocol. Type-checking relates these two levels formally,\nensuring that processes behave as prescribed by the global type. Beyond usual\ncommunication properties, QMPSTs also allow us to prove that qubits are owned\nby a single process at any time, capturing the quantum no-cloning and\nno-deleting theorems. 
We use our approach to verify four quantum protocols from\nthe literature, respectively Teleportation, Secret Sharing, Bit-Commitment, and\nKey Distribution.\n","authors":["Ivan Lanese","Ugo Dal Lago","Vikraman Choudhury"],"pdf_url":"https://arxiv.org/pdf/2409.11133v1.pdf","comment":"To appear at SEFM 2024"},{"id":"http://arxiv.org/abs/2409.11106v1","updated":"2024-09-17T12:02:23Z","published":"2024-09-17T12:02:23Z","title":"Scheme Pearl: Quantum Continuations","summary":" We advance the thesis that the simulation of quantum circuits is\nfundamentally about the efficient management of a large (potentially\nexponential) number of delimited continuations. The family of Scheme languages,\nwith its efficient implementations of first-class continuations and with its\nimperative constructs, provides an elegant host for modeling and simulating\nquantum circuits.\n","authors":["Vikraman Choudhury","Borislav Agapiev","Amr Sabry"],"pdf_url":"https://arxiv.org/pdf/2409.11106v1.pdf","comment":"Appeared at Scheme Workshop 2022"},{"id":"http://arxiv.org/abs/2409.11015v1","updated":"2024-09-17T09:25:42Z","published":"2024-09-17T09:25:42Z","title":"Introducing Quantification into a Hierarchical Graph Rewriting Language","summary":" LMNtal is a programming and modeling language based on hierarchical graph\nrewriting that uses logical variables to represent connectivity and membranes\nto represent hierarchy. On the theoretical side, it allows logical\ninterpretation based on intuitionistic linear logic; on the practical side, its\nfull-fledged implementation supports a graph-based parallel model checker and\nhas been used to model diverse applications including various computational\nmodels. This paper discuss how we extend LMNtal to QLMNtal (LMNtal with\nQuantification) to further enhance the usefulness of hierarchical graph\nrewriting for high-level modeling by introducing quantifiers into rewriting as\nwell as matching. 
Those quantifiers allows us to express universal\nquantification, cardinality and non-existence in an integrated manner. Unlike\nother attempts to introduce quantifiers into graph rewriting, QLMNtal has\nterm-based syntax, whose semantics is smoothly integrated into the small-step\nsemantics of the base language LMNtal. The proposed constructs allow combined\nand nested use of quantifiers within individual rewrite rules.\n","authors":["Haruto Mishina","Kazunori Ueda"],"pdf_url":"https://arxiv.org/pdf/2409.11015v1.pdf","comment":"Extended version (with Appendix) of the paper presented at the 34th\n International Symposium on Logic-Based Program Synthesis and Transformation\n (LOPSTR 2024), Milano, Italy, September 2024, LNCS 14919, Springer-Verlag,\n pp.220-239. 26 pages"},{"id":"http://arxiv.org/abs/2409.11600v1","updated":"2024-09-17T23:15:39Z","published":"2024-09-17T23:15:39Z","title":"No Saved Kaleidosope: an 100% Jitted Neural Network Coding Language with\n Pythonic Syntax","summary":" We developed a jitted compiler for training Artificial Neural Networks using\nC++, LLVM and Cuda. It features object-oriented characteristics, strong typing,\nparallel workers for data pre-processing, pythonic syntax for expressions,\nPyTorch like model declaration and Automatic Differentiation. We implement the\nmechanisms of cache and pooling in order to manage VRAM, cuBLAS for high\nperformance matrix multiplication and cuDNN for convolutional layers. Our\nexperiments with Residual Convolutional Neural Networks on ImageNet, we reach\nsimilar speed but degraded performance. Also, the GRU network experiments show\nsimilar accuracy, but our compiler have degraded speed in that task. However,\nour compiler demonstrates promising results at the CIFAR-10 benchmark, in which\nwe reach the same performance and about the same speed as PyTorch. 
We make the\ncode publicly available at: https://github.com/NoSavedDATA/NoSavedKaleidoscope\n","authors":["Augusto Seben da Rosa","Marlon Daniel Angeli","Jorge Aikes Junior","Alef Iury Ferreira","Lucas Rafael Gris","Anderson da Silva Soares","Arnaldo Candido Junior","Frederico Santos de Oliveira","Gabriel Trevisan Damke","Rafael Teixeira Sousa"],"pdf_url":"https://arxiv.org/pdf/2409.11600v1.pdf","comment":"12 pages, 3 figures and 3 tables"},{"id":"http://arxiv.org/abs/2409.11530v1","updated":"2024-09-17T20:00:22Z","published":"2024-09-17T20:00:22Z","title":"Minuska: Towards a Formally Verified Programming Language Framework","summary":" Programming language frameworks allow us to generate language tools (e.g.,\ninterpreters) just from a formal description of the syntax and semantics of a\nprogramming language. As these frameworks tend to be quite complex, an issue\narises whether we can trust the generated tools. To address this issue, we\nintroduce a practical formal programming language framework called Minuska,\nwhich always generates a provably correct interpreter given a valid language\ndefinition. This is achieved by (1) defining a language MinusLang for\nexpressing programming language definitions and giving it formal semantics and\n(2) using the Coq proof assistant to implement an interpreter parametric in a\nMinusLang definition and to prove it correct. Minuska provides strong\ncorrectness guarantees and can support nontrivial languages while performing\nwell. 
This is the extended version of the SEFM24 paper of the same name.\n","authors":["Jan Tušil","Jan Obdržálek"],"pdf_url":"https://arxiv.org/pdf/2409.11530v1.pdf","comment":null}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2103.15453v4","updated":"2024-09-17T13:01:21Z","published":"2021-03-29T09:37:30Z","title":"Disentangling Parallelism and Interference in Game Semantics","summary":" Game semantics is a denotational semantics presenting compositionally the\ncomputational behaviour of various kinds of effectful programs. One of its\ncelebrated achievement is to have obtained full abstraction results for\nprogramming languages with a variety of computational effects, in a single\nframework. This is known as the semantic cube or Abramsky's cube, which for\nsequential deterministic programs establishes a correspondence between certain\nconditions on strategies (''innocence'', ''well-bracketing'', ''visibility'')\nand the absence of matching computational effects. Outside of the sequential\ndeterministic realm, there are still a wealth of game semantics-based full\nabstraction results; but they no longer fit in a unified canvas. In particular,\nGhica and Murawski's fully abstract model for shared state concurrency (IA)\ndoes not have a matching notion of pure parallel program-we say that\nparallelism and interference (i.e. state plus semaphores) are entangled. In\nthis paper we construct a causal version of Ghica and Murawski's model, also\nfully abstract for IA. We provide compositional conditions parallel innocence\nand sequentiality, respectively banning interference and parallelism, and\nleading to four full abstraction results. 
To our knowledge, this is the first\nextension of Abramsky's semantic cube programme beyond the sequential\ndeterministic world.\n","authors":["Simon Castellan","Pierre Clairambault"],"pdf_url":"https://arxiv.org/pdf/2103.15453v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11044v1","updated":"2024-09-17T10:08:14Z","published":"2024-09-17T10:08:14Z","title":"Computation and Complexity of Preference Inference Based on Hierarchical\n Models","summary":" Preference Inference involves inferring additional user preferences from\nelicited or observed preferences, based on assumptions regarding the form of\nthe user's preference relation. In this paper we consider a situation in which\nalternatives have an associated vector of costs, each component corresponding\nto a different criterion, and are compared using a kind of lexicographic order,\nsimilar to the way alternatives are compared in a Hierarchical Constraint Logic\nProgramming model. It is assumed that the user has some (unknown) importance\nordering on criteria, and that to compare two alternatives, firstly, the\ncombined cost of each alternative with respect to the most important criteria\nare compared; only if these combined costs are equal, are the next most\nimportant criteria considered. The preference inference problem then consists\nof determining whether a preference statement can be inferred from a set of\ninput preferences. We show that this problem is coNP-complete, even if one\nrestricts the cardinality of the equal-importance sets to have at most two\nelements, and one only considers non-strict preferences. However, it is\npolynomial if it is assumed that the user's ordering of criteria is a total\nordering; it is also polynomial if the sets of equally important criteria are\nall equivalence classes of a given fixed equivalence relation. 
We give an\nefficient polynomial algorithm for these cases, which also throws light on the\nstructure of the inference.\n","authors":["Nic Wilson","Anne-Marie George","Barry O'Sullivan"],"pdf_url":"https://arxiv.org/pdf/2409.11044v1.pdf","comment":"Longer Version of IJCAI'15 publication\n https://www.ijcai.org/Proceedings/15/Papers/461.pdf"},{"id":"http://arxiv.org/abs/2409.11587v1","updated":"2024-09-17T22:26:50Z","published":"2024-09-17T22:26:50Z","title":"Resource approximation for the $λμ$-calculus","summary":" The $\\lambda\\mu$-calculus plays a central role in the theory of programming\nlanguages as it extends the Curry-Howard correspondence to classical logic. A\nmajor drawback is that it does not satisfy B\\\"ohm's Theorem and it lacks the\ncorresponding notion of approximation. On the contrary, we show that Ehrhard\nand Regnier's Taylor expansion can be easily adapted, thus providing a resource\nconscious approximation theory. This produces a sensible $\\lambda\\mu$-theory\nwith which we prove some advanced properties of the $\\lambda\\mu$-calculus, such\nas Stability and Perpendicular Lines Property, from which the impossibility of\nparallel computations follows.\n","authors":["Davide Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2409.11587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11572v1","updated":"2024-09-17T21:54:19Z","published":"2024-09-17T21:54:19Z","title":"Stability Property for the Call-by-Value $λ$-calculus through\n Taylor Expansion","summary":" We prove the Stability Property for the call-by-value $\\lambda$-calculus (CbV\nin the following). This result states necessary conditions under which the\ncontexts of the CbV $\\lambda$-calculus commute with intersections of\napproximants. This is an important non-trivial result, which implies the\nsequentiality of the calculus. We prove it via the tool of Taylor-resource\napproximation, whose power has been shown in several recent papers. 
This\ntechnique is usually conceived for the ordinary $\\lambda$-calculus, but it can\nbe easily defined for the CbV setting. Our proof is the adaptation of the one\nfor the ordinary calculus using the same technique, with some minimal technical\nmodification due to the fact that in the CbV setting one linearises terms in a\nslightly different way than usual (cfr. $!(A\\multimap B)$ vs $!A\\multimap B$).\nThe content of this article is taken from the PhD thesis of the author.\n","authors":["Davide Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2409.11572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11566v1","updated":"2024-09-17T21:40:24Z","published":"2024-09-17T21:40:24Z","title":"Denotational semantics driven simplicial homology?","summary":" We look at the proofs of a fragment of Linear Logic as a whole: in fact,\nLinear Logic's coherent semantics interprets the proofs of a given formula $A$\nas faces of an abstract simplicial complex, thus allowing us to see the set of\nthe (interpretations of the) proofs of $A$ as a geometrical space, not just a\nset. This point of view has never been really investigated. For a ``webbed''\ndenotational semantics -- say the relational one --, it suffices to down-close\nthe set of (the interpretations of the) proofs of $A$ in order to give rise to\nan abstract simplicial complex whose faces do correspond to proofs of $A$.\nSince this space comes triangulated by construction, a natural geometrical\nproperty to consider is its homology. However, we immediately stumble on a\nproblem: if we want the homology to be invariant w.r.t. to some notion of\ntype-isomorphism, we are naturally led to consider the homology functor acting,\nat the level of morphisms, on ``simplicial relations'' rather than simplicial\nmaps as one does in topology. 
The task of defining the homology functor on this\nmodified category can be achieved by considering a very simple monad, which is\nalmost the same as the power-set monad; but, doing so, we end up considering\nnot anymore the homology of the original space, but rather of its\ntransformation under the action of the monad. Does this transformation keep the\nhomology invariant ? Is this transformation meaningful from a geometrical or\nlogical/computational point of view ?\n","authors":["Davide Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2409.11566v1.pdf","comment":null}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2409.11220v1","updated":"2024-09-17T14:13:03Z","published":"2024-09-17T14:13:03Z","title":"eBPF-mm: Userspace-guided memory management in Linux with eBPF","summary":" We leverage eBPF in order to implement custom policies in the Linux memory\nsubsystem. Inspired by CBMM, we create a mechanism that provides the kernel\nwith hints regarding the benefit of promoting a page to a specific size. We\nintroduce a new hook point in Linux page fault handling path for eBPF programs,\nproviding them the necessary context to determine the page size to be used. We\nthen develop a framework that allows users to define profiles for their\napplications and load them into the kernel. A profile consists of memory\nregions of interest and their expected benefit from being backed by 4KB, 64KB\nand 2MB pages. 
In our evaluation, we profiled our workloads to identify hot\nmemory regions using DAMON.\n","authors":["Konstantinos Mores","Stratos Psomadakis","Georgios Goumas"],"pdf_url":"https://arxiv.org/pdf/2409.11220v1.pdf","comment":"ACM SRC@MICRO'24"},{"id":"http://arxiv.org/abs/2409.10918v1","updated":"2024-09-17T06:23:12Z","published":"2024-09-17T06:23:12Z","title":"FSL-HDnn: A 5.7 TOPS/W End-to-end Few-shot Learning Classifier\n Accelerator with Feature Extraction and Hyperdimensional Computing","summary":" This paper introduces FSL-HDnn, an energy-efficient accelerator that\nimplements the end-to-end pipeline of feature extraction, classification, and\non-chip few-shot learning (FSL) through gradient-free learning techniques in a\n40 nm CMOS process. At its core, FSL-HDnn integrates two low-power modules:\nWeight clustering feature extractor and Hyperdimensional Computing (HDC).\nFeature extractor utilizes advanced weight clustering and pattern reuse\nstrategies for optimized CNN-based feature extraction. Meanwhile, HDC emerges\nas a novel approach for lightweight FSL classifier, employing hyperdimensional\nvectors to improve training accuracy significantly compared to traditional\ndistance-based approaches. This dual-module synergy not only simplifies the\nlearning process by eliminating the need for complex gradients but also\ndramatically enhances energy efficiency and performance. 
Specifically, FSL-HDnn\nachieves an Intensity unprecedented energy efficiency of 5.7 TOPS/W for feature\n1 extraction and 0.78 TOPS/W for classification and learning Training Intensity\nphases, achieving improvements of 2.6X and 6.6X, respectively, Storage over\ncurrent state-of-the-art CNN and FSL processors.\n","authors":["Haichao Yang","Chang Eun Song","Weihong Xu","Behnam Khaleghi","Uday Mallappa","Monil Shah","Keming Fan","Mingu Kang","Tajana Rosing"],"pdf_url":"https://arxiv.org/pdf/2409.10918v1.pdf","comment":"4 pages, 12 figures, ESSERC 2024"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2409.11392v1","updated":"2024-09-17T17:50:52Z","published":"2024-09-17T17:50:52Z","title":"Temporal Load Imbalance on Ondes3D Seismic Simulator for Different\n Multicore Architectures","summary":" The variety of today's multicore architectures motivates researchers to\nexplore parallel scientific applications on different platforms. Load imbalance\nis one performance issue that can prejudice parallel applications from\nexploiting the computational power of these platforms. Ondes3D is a scientific\napplication for seismic wave simulation used to assess the geological impact of\nearthquakes. Its parallelism relies on applying a regular domain decomposition\nin the geological domain provided and distributing each sub-domain to MPI\nranks. Previous works investigate the significant spatial and temporal\nimbalance in Ondes3D and suggest new parallelization and load balancing\ntechniques to minimize them. However, none explored its execution on different\narchitectures. Our paper evaluates the performance of Ondes3D for two\nearthquake scenarios on eight different multicore architectures, including\nIntel, AMD, and ARM processors. We measure the load distribution per MPI rank,\nevaluate the temporal load imbalance, and compare the execution of the\napplication's kernels. 
Our results show that the temporal load imbalance in\nOndes3D depends on the architecture chosen, with some platforms minimizing such\nimbalance more effectively.\n","authors":["Ana Luisa Veroneze Solórzano","Philippe Olivier Alexandre Navaux","Lucas Mello Schnorr"],"pdf_url":"https://arxiv.org/pdf/2409.11392v1.pdf","comment":"The 2020 International Conference on High Performance Computing and\n Simulation (HPCS 2020)"},{"id":"http://arxiv.org/abs/2402.09222v2","updated":"2024-09-17T16:52:28Z","published":"2024-02-14T15:01:21Z","title":"Integrating ytopt and libEnsemble to Autotune OpenMC","summary":" ytopt is a Python machine-learning-based autotuning software package\ndeveloped within the ECP PROTEAS-TUNE project. The ytopt software adopts an\nasynchronous search framework that consists of sampling a small number of input\nparameter configurations and progressively fitting a surrogate model over the\ninput-output space until exhausting the user-defined maximum number of\nevaluations or the wall-clock time. libEnsemble is a Python toolkit for\ncoordinating workflows of asynchronous and dynamic ensembles of calculations\nacross massively parallel resources developed within the ECP PETSc/TAO project.\nlibEnsemble helps users take advantage of massively parallel resources to solve\ndesign, decision, and inference problems and expands the class of problems that\ncan benefit from increased parallelism. In this paper we present our\nmethodology and framework to integrate ytopt and libEnsemble to take advantage\nof massively parallel resources to accelerate the autotuning process.\nSpecifically, we focus on using the proposed framework to autotune the ECP\nExaSMR application OpenMC, an open source Monte Carlo particle transport code.\nOpenMC has seven tunable parameters some of which have large ranges such as the\nnumber of particles in-flight, which is in the range of 100,000 to 8 million,\nwith its default setting of 1 million. 
Setting the proper combination of these\nparameter values to achieve the best performance is extremely time-consuming.\nTherefore, we apply the proposed framework to autotune the MPI/OpenMP offload\nversion of OpenMC based on a user-defined metric such as the figure of merit\n(FoM) (particles/s) or energy efficiency energy-delay product (EDP) on Crusher\nat Oak Ridge Leadership Computing Facility. The experimental results show that\nwe achieve improvement up to 29.49\\% in FoM and up to 30.44\\% in EDP.\n","authors":["Xingfu Wu","John R. Tramm","Jeffrey Larson","John-Luke Navarro","Prasanna Balaprakash","Brice Videau","Michael Kruse","Paul Hovland","Valerie Taylor","Mary Hall"],"pdf_url":"https://arxiv.org/pdf/2402.09222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11129v1","updated":"2024-09-17T12:28:02Z","published":"2024-09-17T12:28:02Z","title":"Can Graph Reordering Speed Up Graph Neural Network Training? An\n Experimental Study","summary":" Graph neural networks (GNNs) are a type of neural network capable of learning\non graph-structured data. However, training GNNs on large-scale graphs is\nchallenging due to iterative aggregations of high-dimensional features from\nneighboring vertices within sparse graph structures combined with neural\nnetwork operations. The sparsity of graphs frequently results in suboptimal\nmemory access patterns and longer training time. Graph reordering is an\noptimization strategy aiming to improve the graph data layout. It has shown to\nbe effective to speed up graph analytics workloads, but its effect on the\nperformance of GNN training has not been investigated yet. 
The generalization\nof reordering to GNN performance is nontrivial, as multiple aspects must be\nconsidered: GNN hyper-parameters such as the number of layers, the number of\nhidden dimensions, and the feature size used in the GNN model, neural network\noperations, large intermediate vertex states, and GPU acceleration.\n In our work, we close this gap by performing an empirical evaluation of 12\nreordering strategies in two state-of-the-art GNN systems, PyTorch Geometric\nand Deep Graph Library. Our results show that graph reordering is effective in\nreducing training time for CPU- and GPU-based training, respectively. Further,\nwe find that GNN hyper-parameters influence the effectiveness of reordering,\nthat reordering metrics play an important role in selecting a reordering\nstrategy, that lightweight reordering performs better for GPU-based than for\nCPU-based training, and that invested reordering time can in many cases be\namortized.\n","authors":["Nikolai Merkel","Pierre Toussing","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2409.11129v1.pdf","comment":"To be published in proceedings of the 51st International Conference\n on Very Large Data Bases (VLDB), September 1-5, 2025"}],"Operation Systems":[{"id":"http://arxiv.org/abs/2409.11271v1","updated":"2024-09-17T15:21:47Z","published":"2024-09-17T15:21:47Z","title":"Analysis of Synchronization Mechanisms in Operating Systems","summary":" This research analyzed the performance and consistency of four\nsynchronization mechanisms-reentrant locks, semaphores, synchronized methods,\nand synchronized blocks-across three operating systems: macOS, Windows, and\nLinux. Synchronization ensures that concurrent processes or threads access\nshared resources safely, and efficient synchronization is vital for maintaining\nsystem performance and reliability. 
The study aimed to identify the\nsynchronization mechanism that balances efficiency, measured by execution time,\nand consistency, assessed by variance and standard deviation, across platforms.\nThe initial hypothesis proposed that mutex-based mechanisms, specifically\nsynchronized methods and blocks, would be the most efficient due to their\nsimplicity. However, empirical results showed that reentrant locks had the\nlowest average execution time (14.67ms), making them the most efficient\nmechanism, but with the highest variability (standard deviation of 1.15). In\ncontrast, synchronized methods, blocks, and semaphores exhibited higher average\nexecution times (16.33ms for methods and 16.67ms for blocks) but with greater\nconsistency (variance of 0.33). The findings indicated that while reentrant\nlocks were faster, they were more platform-dependent, whereas mutex-based\nmechanisms provided more predictable performance across all operating systems.\nThe use of virtual machines for Windows and Linux was a limitation, potentially\naffecting the results. Future research should include native testing and\nexplore additional synchronization mechanisms and higher concurrency levels.\nThese insights help developers and system designers optimize synchronization\nstrategies for either performance or stability, depending on the application's\nrequirements.\n","authors":["Oluwatoyin Kode","Temitope Oyemade"],"pdf_url":"https://arxiv.org/pdf/2409.11271v1.pdf","comment":"This paper was submitted to the 2nd International Conference on\n Computer Science and Software Engineering (CSSE 2024). It contains 19 pages"},{"id":"http://arxiv.org/abs/2409.11220v1","updated":"2024-09-17T14:13:03Z","published":"2024-09-17T14:13:03Z","title":"eBPF-mm: Userspace-guided memory management in Linux with eBPF","summary":" We leverage eBPF in order to implement custom policies in the Linux memory\nsubsystem. 
Inspired by CBMM, we create a mechanism that provides the kernel\nwith hints regarding the benefit of promoting a page to a specific size. We\nintroduce a new hook point in Linux page fault handling path for eBPF programs,\nproviding them the necessary context to determine the page size to be used. We\nthen develop a framework that allows users to define profiles for their\napplications and load them into the kernel. A profile consists of memory\nregions of interest and their expected benefit from being backed by 4KB, 64KB\nand 2MB pages. In our evaluation, we profiled our workloads to identify hot\nmemory regions using DAMON.\n","authors":["Konstantinos Mores","Stratos Psomadakis","Georgios Goumas"],"pdf_url":"https://arxiv.org/pdf/2409.11220v1.pdf","comment":"ACM SRC@MICRO'24"},{"id":"http://arxiv.org/abs/2409.10946v1","updated":"2024-09-17T07:28:56Z","published":"2024-09-17T07:28:56Z","title":"Skip TLB flushes for reused pages within mmap's","summary":" Memory access efficiency is significantly enhanced by caching recent address\ntranslations in the CPUs' Translation Lookaside Buffers (TLBs). However, since\nthe operating system is not aware of which core is using a particular mapping,\nit flushes TLB entries across all cores where the application runs whenever\naddresses are unmapped, ensuring security and consistency. These TLB flushes,\nknown as TLB shootdowns, are costly and create a performance and scalability\nbottleneck. A key contributor to TLB shootdowns is memory-mapped I/O,\nparticularly during mmap-munmap cycles and page cache evictions. Often, the\nsame physical pages are reassigned to the same process post-eviction,\npresenting an opportunity for the operating system to reduce the frequency of\nTLB shootdowns. We demonstrate, that by slightly extending the mmap function,\nTLB shootdowns for these \"recycled pages\" can be avoided.\n Therefore we introduce and implement the \"fast page recycling\" (FPR) feature\nwithin the mmap system call. 
FPR-mmaps maintain security by only triggering TLB\nshootdowns when a page exits its recycling cycle and is allocated to a\ndifferent process. To ensure consistency when FPR-mmap pointers are used, we\nmade minor adjustments to virtual memory management to avoid the ABA problem.\nUnlike previous methods to mitigate shootdown effects, our approach does not\nrequire any hardware modifications and operates transparently within the\nexisting Linux virtual memory framework.\n Our evaluations across a variety of CPU, memory, and storage setups,\nincluding persistent memory and Optane SSDs, demonstrate that FPR delivers\nnotable performance gains, with improvements of up to 28% in real-world\napplications and 92% in micro-benchmarks. Additionally, we show that TLB\nshootdowns are a significant source of bottlenecks, previously misattributed to\nother components of the Linux kernel.\n","authors":["Frederic Schimmelpfennig","André Brinkmann","Hossein Asadi","Reza Salkhordeh"],"pdf_url":"https://arxiv.org/pdf/2409.10946v1.pdf","comment":null}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2409.11079v1","updated":"2024-09-17T11:17:56Z","published":"2024-09-17T11:17:56Z","title":"The Complexity of Maximizing the MST-ratio","summary":" Given a finite set of red and blue points in $\\mathbb{R}^d$, the MST-ratio is\nthe combined length of the Euclidean minimum spanning trees of red points and\nof blue points divided by the length of the Euclidean minimum spanning tree of\nthe union of them. The maximum MST-ratio of a point set is the maximum\nMST-ratio over all non-trivial colorings of its points by red and blue. We\nprove that the problem of finding the maximum MST-ratio of a given point set is\nNP-hard when the dimension is a part of the input. Moreover, we present a\n$O(n^2)$ running time $3$-approximation algorithm for it. As a part of the\nproof, we show that in any metric space, the maximum MST-ratio is smaller than\n$3$. 
Additionally, we study the average MST-ratio over all colorings of a set\nof $n$ points. We show that this average is always at least $\\frac{n-2}{n-1}$,\nand for $n$ random points uniformly distributed in a $d$-dimensional unit cube,\nthe average tends to $\\sqrt[d]{2}$ in expectation as $n$ goes to infinity.\n","authors":["Afrouz Jabal Ameli","Faezeh Motiei","Morteza Saghafian"],"pdf_url":"https://arxiv.org/pdf/2409.11079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02190v2","updated":"2024-09-17T06:56:57Z","published":"2023-11-03T18:47:39Z","title":"The Tensor as an Informational Resource","summary":" A tensor is a multidimensional array of numbers that can be used to store\ndata, encode a computational relation and represent quantum entanglement. In\nthis sense a tensor can be viewed as valuable resource whose transformation can\nlead to an understanding of structure in data, computational complexity and\nquantum information.\n In order to facilitate the understanding of this resource, we propose a\nfamily of information-theoretically constructed preorders on tensors, which can\nbe used to compare tensors with each other and to assess the existence of\ntransformations between them. The construction places copies of a given tensor\nat the edges of a hypergraph and allows transformations at the vertices. A\npreorder is then induced by the transformations possible in a given growing\nsequence of hypergraphs. The new family of preorders generalises the asymptotic\nrestriction preorder which Strassen defined in order to study the computational\ncomplexity of matrix multiplication.\n We derive general properties of the preorders and their associated asymptotic\nnotions of tensor rank and view recent results on tensor rank non-additivity,\ntensor networks and algebraic complexity in this unifying frame. 
We hope that\nthis work will provide a useful vantage point for exploring tensors in applied\nmathematics, physics and computer science, but also from a purely mathematical\npoint of view.\n","authors":["Matthias Christandl"],"pdf_url":"https://arxiv.org/pdf/2311.02190v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2409.11597v1","updated":"2024-09-17T23:09:25Z","published":"2024-09-17T23:09:25Z","title":"The Sample Complexity of Smooth Boosting and the Tightness of the\n Hardcore Theorem","summary":" Smooth boosters generate distributions that do not place too much weight on\nany given example. Originally introduced for their noise-tolerant properties,\nsuch boosters have also found applications in differential privacy,\nreproducibility, and quantum learning theory. We study and settle the sample\ncomplexity of smooth boosting: we exhibit a class that can be weak learned to\n$\\gamma$-advantage over smooth distributions with $m$ samples, for which strong\nlearning over the uniform distribution requires\n$\\tilde{\\Omega}(1/\\gamma^2)\\cdot m$ samples. This matches the overhead of\nexisting smooth boosters and provides the first separation from the setting of\ndistribution-independent boosting, for which the corresponding overhead is\n$O(1/\\gamma)$.\n Our work also sheds new light on Impagliazzo's hardcore theorem from\ncomplexity theory, all known proofs of which can be cast in the framework of\nsmooth boosting. For a function $f$ that is mildly hard against size-$s$\ncircuits, the hardcore theorem provides a set of inputs on which $f$ is\nextremely hard against size-$s'$ circuits. A downside of this important result\nis the loss in circuit size, i.e. that $s' \\ll s$. 
Answering a question of\nTrevisan, we show that this size loss is necessary and in fact, the parameters\nachieved by known proofs are the best possible.\n","authors":["Guy Blanc","Alexandre Hayderi","Caleb Koch","Li-Yang Tan"],"pdf_url":"https://arxiv.org/pdf/2409.11597v1.pdf","comment":"46 pages, FOCS 2024"}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2409.10822v1","updated":"2024-09-17T01:28:42Z","published":"2024-09-17T01:28:42Z","title":"Query Learning of Advice and Nominal Automata","summary":" Learning automata by queries is a long-studied area initiated by Angluin in\n1987 with the introduction of the $L^*$ algorithm to learn regular languages,\nwith a large body of work afterwards on many different variations and\ngeneralizations of DFAs. Recently, Chase and Freitag introduced a novel\napproach to proving query learning bounds by computing combinatorial complexity\nmeasures for the classes in question, which they applied to the setting of DFAs\nto obtain qualitatively different results compared to the $L^*$ algorithm.\nUsing this approach, we prove new query learning bounds for two generalizations\nof DFAs. The first setting is that of advice DFAs, which are DFAs augmented\nwith an advice string that informs the DFA's transition behavior at each step.\nFor advice DFAs, we give the first known upper bounds for query complexity. The\nsecond setting is that of nominal DFAs, which generalize DFAs to infinite\nalphabets which admit some structure via symmetries. 
For nominal DFAs, we make\nqualitative improvements over prior results.\n","authors":["Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.10822v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2205.03515v5","updated":"2024-09-17T21:03:38Z","published":"2022-05-07T01:06:52Z","title":"Standard Automata Theory and Process Algebra","summary":" The concepts of machine homomorphism and machine products developed in the\nautomata theory literature in the 1960s are more relevant to concurrent systems\nthan is acknowledged in the process algebra literature and offer a\nsophisticated mathematical basis for understanding concurrent systems.\n","authors":["Victor Yodaiken"],"pdf_url":"https://arxiv.org/pdf/2205.03515v5.pdf","comment":"fixes a number of typographical errors and sub-optimal phrasings"}]},"2024-09-18T00:00:00Z":{"Computational Complexity":[{"id":"http://arxiv.org/abs/2204.11753v6","updated":"2024-09-18T08:55:00Z","published":"2022-04-25T16:14:49Z","title":"Partitioning Problems with Splittings and Interval Targets","summary":" The $n$-way number partitioning problem is a classic problem in combinatorial\noptimization, with applications to diverse settings such as fair allocation and\nmachine scheduling. All these problems are NP-hard, but various approximation\nalgorithms are known. We consider three closely related kinds of\napproximations.\n The first two variants optimize the partition such that: in the first variant\nsome fixed number $s$ of items can be \\emph{split} between two or more bins and\nin the second variant we allow at most a fixed number $t$ of \\emph{splittings}.\nThe third variant is a decision problem: the largest bin sum must be within a\npre-specified interval, parameterized by a fixed rational number $u$ times the\nlargest item size.\n When the number of bins $n$ is unbounded, we show that every variant is\nstrongly {\\sf NP}-complete. When the number of bins $n$ is fixed, the running\ntime depends on the fixed parameters $s,t,u$. 
For each variant, we give a\ncomplete picture of its running time.\n For $n=2$, the running time is easy to identify. Our main results consider\nany fixed integer $n \\geq 3$. Using a two-way polynomial-time reduction between\nthe first and the third variant, we show that $n$-way number-partitioning with\n$s$ split items can be solved in polynomial time if $s \\geq n-2$, and it is\n{\\sf NP}-complete otherwise. Also, $n$-way number-partitioning with $t$\nsplittings can be solved in polynomial time if $t \\geq n-1$, and it is {\\sf\nNP}-complete otherwise. Finally, we show that the third variant can be solved\nin polynomial time if $u \\geq (n-2)/n$, and it is {\\sf NP}-complete otherwise.\nOur positive results for the optimization problems consider both min-max and\nmax-min versions.\n Using the same reduction, we provide a fully polynomial-time approximation\nscheme for the case where the number of split items is lower than $n-2$.\n","authors":["Samuel Bismuth","Vladislav Makarov","Erel Segal-Halevi","Dana Shapira"],"pdf_url":"https://arxiv.org/pdf/2204.11753v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09641v3","updated":"2024-09-18T15:09:58Z","published":"2024-08-19T01:58:54Z","title":"Character Complexity: A Novel Measure for Quantum Circuit Analysis","summary":" In the rapidly evolving field of quantum computing, quantifying circuit\ncomplexity remains a critical challenge. This paper introduces Character\nComplexity, a novel measure that bridges Group-theoretic concepts with\npractical quantum computing concerns. By leveraging tools from representation\ntheory, I prove several key properties of character complexity and establish a\nsurprising connection to the classical simulability of quantum circuits. This\nnew measure offers a fresh perspective on the complexity landscape of quantum\nalgorithms, potentially reshaping our understanding of quantum-classical\ncomputational boundaries. 
I present innovative visualization methods for\ncharacter complexity, providing intuitive insights into the structure of\nquantum circuits. The empirical results reveal intriguing scaling behaviors\nwith respect to qubit and gate counts, opening new avenues for quantum\nalgorithm design and optimization. This work not only contributes to the\ntheoretical foundations of quantum complexity but also offers practical tools\nfor the quantum computing community. As quantum hardware continues to advance,\ncharacter complexity could play a crucial role in developing more efficient\nquantum algorithms and in exploring the fundamental limits of quantum\ncomputation.\n","authors":["Daksh Shami"],"pdf_url":"https://arxiv.org/pdf/2408.09641v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12179v1","updated":"2024-09-18T17:51:48Z","published":"2024-09-18T17:51:48Z","title":"Computational Dynamical Systems","summary":" We study the computational complexity theory of smooth, finite-dimensional\ndynamical systems. Building off of previous work, we give definitions for what\nit means for a smooth dynamical system to simulate a Turing machine. We then\nshow that 'chaotic' dynamical systems (more precisely, Axiom A systems) and\n'integrable' dynamical systems (more generally, measure-preserving systems)\ncannot robustly simulate universal Turing machines, although such machines can\nbe robustly simulated by other kinds of dynamical systems. Subsequently, we\nshow that any Turing machine that can be encoded into a structurally stable\none-dimensional dynamical system must have a decidable halting problem, and\nmoreover an explicit time complexity bound in instances where it does halt.\nMore broadly, our work elucidates what it means for one 'machine' to simulate\nanother, and emphasizes the necessity of defining low-complexity 'encoders' and\n'decoders' to translate between the dynamics of the simulation and the system\nbeing simulated. 
We highlight how the notion of a computational dynamical\nsystem leads to questions at the intersection of computational complexity\ntheory, dynamical systems theory, and real algebraic geometry.\n","authors":["Jordan Cotler","Semon Rezchikov"],"pdf_url":"https://arxiv.org/pdf/2409.12179v1.pdf","comment":"46+14 pages, 6 figures; accepted to FOCS 2024"},{"id":"http://arxiv.org/abs/2405.01017v2","updated":"2024-09-18T00:27:31Z","published":"2024-05-02T05:38:57Z","title":"NP-completeness of Tiling Finite Simply Connected Regions with a Fixed\n Set of Wang Tiles","summary":" The computational complexity of tiling finite simply connected regions with a\nfixed set of tiles is studied in this paper. We show that the problem of tiling\nsimply connected regions with a fixed set of $23$ Wang tiles is NP-complete. As\na consequence, the problem of tiling simply connected regions with a fixed set\nof $111$ rectangles is NP-complete. Our results improve that of Igor Pak and\nJed Yang by using fewer numbers of tiles. Notably in the case of Wang tiles,\nthe number has decreased by more than one third from $35$ to $23$.\n","authors":["Chao Yang","Zhujun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.01017v2.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2407.14705v2","updated":"2024-09-18T08:57:59Z","published":"2024-07-19T23:42:28Z","title":"Reactive graphs in action (extended version)","summary":" Reactive graphs are transition structures whereas edges become active and\ninactive during its evolution, that were introduced by Dov Gabbay from a\nmathematical's perspective. This paper presents Marge\n(https://fm-dcc.github.io/MARGe), a web-based tool to visualise and analyse\nreactive graphs enriched with labels. Marge animates the operational semantics\nof reactive graphs and offers different graphical views to provide insights\nover concrete systems. 
We motivate the applicability of reactive graphs for\nadaptive systems and for featured transition systems, using Marge to tighten\nthe gap between the existing theoretical models and their usage to analyse\nconcrete systems.\n","authors":["David Tinoco","Alexandre Madeira","Manuel A. Martins","José Proença"],"pdf_url":"https://arxiv.org/pdf/2407.14705v2.pdf","comment":"Companion paper of an article accepted at FACS 2024 with a similar\n name"},{"id":"http://arxiv.org/abs/2409.12179v1","updated":"2024-09-18T17:51:48Z","published":"2024-09-18T17:51:48Z","title":"Computational Dynamical Systems","summary":" We study the computational complexity theory of smooth, finite-dimensional\ndynamical systems. Building off of previous work, we give definitions for what\nit means for a smooth dynamical system to simulate a Turing machine. We then\nshow that 'chaotic' dynamical systems (more precisely, Axiom A systems) and\n'integrable' dynamical systems (more generally, measure-preserving systems)\ncannot robustly simulate universal Turing machines, although such machines can\nbe robustly simulated by other kinds of dynamical systems. Subsequently, we\nshow that any Turing machine that can be encoded into a structurally stable\none-dimensional dynamical system must have a decidable halting problem, and\nmoreover an explicit time complexity bound in instances where it does halt.\nMore broadly, our work elucidates what it means for one 'machine' to simulate\nanother, and emphasizes the necessity of defining low-complexity 'encoders' and\n'decoders' to translate between the dynamics of the simulation and the system\nbeing simulated. 
We highlight how the notion of a computational dynamical\nsystem leads to questions at the intersection of computational complexity\ntheory, dynamical systems theory, and real algebraic geometry.\n","authors":["Jordan Cotler","Semon Rezchikov"],"pdf_url":"https://arxiv.org/pdf/2409.12179v1.pdf","comment":"46+14 pages, 6 figures; accepted to FOCS 2024"},{"id":"http://arxiv.org/abs/2409.12068v1","updated":"2024-09-18T15:41:43Z","published":"2024-09-18T15:41:43Z","title":"The repetition threshold for ternary rich words","summary":" In 2014, Vesti proposed the problem of determining the repetition threshold\nfor infinite rich words, i.e., for infinite words in which all factors of\nlength $n$ contain $n$ distinct nonempty palindromic factors. In 2020, Currie,\nMol, and Rampersad proved a conjecture of Baranwal and Shallit that the\nrepetition threshold for binary rich words is $2 + \\sqrt{2}/2$. In this paper,\nwe prove a structure theorem for $16/7$-power-free ternary rich words. Using\nthe structure theorem, we deduce that the repetition threshold for ternary rich\nwords is $1 + 1/(3 - \\mu) \\approx 2.25876324$, where $\\mu$ is the unique real\nroot of the polynomial $x^3 - 2x^2 - 1$.\n","authors":["James D. Currie","Lucas Mol","Jarkko Peltomäki"],"pdf_url":"https://arxiv.org/pdf/2409.12068v1.pdf","comment":"60 pages"},{"id":"http://arxiv.org/abs/2409.12029v1","updated":"2024-09-18T14:45:23Z","published":"2024-09-18T14:45:23Z","title":"Biological arrow of time: Emergence of tangled information hierarchies\n and self-modelling dynamics","summary":" We study open-ended evolution by focusing on computational and\ninformation-processing dynamics underlying major evolutionary transitions. In\ndoing so, we consider biological organisms as hierarchical dynamical systems\nthat generate regularities in their phase-spaces through interactions with\ntheir environment. 
These emergent information patterns can then be encoded\nwithin the organism's components, leading to self-modelling \"tangled\nhierarchies\". Our main conjecture is that when macro-scale patterns are encoded\nwithin micro-scale components, it creates fundamental tensions (computational\ninconsistencies) between what is encodable at a particular evolutionary stage\nand what is potentially realisable in the environment. A resolution of these\ntensions triggers an evolutionary transition which expands the problem-space,\nat the cost of generating new tensions in the expanded space, in a continual\nprocess. We argue that biological complexification can be interpreted\ncomputation-theoretically, within the G\\\"odel--Turing--Post recursion-theoretic\nframework, as open-ended generation of computational novelty. In general, this\nprocess can be viewed as a meta-simulation performed by higher-order systems\nthat successively simulate the computation carried out by lower-order systems.\nThis computation-theoretic argument provides a basis for hypothesising the\nbiological arrow of time.\n","authors":["Mikhail Prokopenko","Paul C. W. Davies","Michael Harré","Marcus Heisler","Zdenka Kuncic","Geraint F. Lewis","Ori Livson","Joseph T. Lizier","Fernando E. Rosas"],"pdf_url":"https://arxiv.org/pdf/2409.12029v1.pdf","comment":"30 pages, 13 figures"},{"id":"http://arxiv.org/abs/2406.15540v2","updated":"2024-09-18T08:21:29Z","published":"2024-06-21T17:39:57Z","title":"Specify What? Enhancing Neural Specification Synthesis by Symbolic\n Methods","summary":" We investigate how combinations of Large Language Models (LLMs) and symbolic\nanalyses can be used to synthesise specifications of C programs. The LLM\nprompts are augmented with outputs from two formal methods tools in the Frama-C\necosystem, Pathcrawler and EVA, to produce C program annotations in the\nspecification language ACSL. 
We demonstrate how the addition of symbolic\nanalysis to the workflow impacts the quality of annotations: information about\ninput/output examples from Pathcrawler produce more context-aware annotations,\nwhile the inclusion of EVA reports yields annotations more attuned to runtime\nerrors. In addition, we show that the method infers rather the programs intent\nthan its behaviour, by generating specifications for buggy programs and\nobserving robustness of the result against bugs.\n","authors":["George Granberry","Wolfgang Ahrendt","Moa Johansson"],"pdf_url":"https://arxiv.org/pdf/2406.15540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1906.04199v6","updated":"2024-09-18T08:09:05Z","published":"2019-05-15T11:35:35Z","title":"Synthesis of Computable Regular Functions of Infinite Words","summary":" Regular functions from infinite words to infinite words can be equivalently\nspecified by MSO-transducers, streaming $\\omega$-string transducers as well as\ndeterministic two-way transducers with look-ahead. In their one-way\nrestriction, the latter transducers define the class of rational functions.\nEven though regular functions are robustly characterised by several\nfinite-state devices, even the subclass of rational functions may contain\nfunctions which are not computable (by a Turing machine with infinite input).\nThis paper proposes a decision procedure for the following synthesis problem:\ngiven a regular function $f$ (equivalently specified by one of the\naforementioned transducer model), is $f$ computable and if it is, synthesize a\nTuring machine computing it.\n For regular functions, we show that computability is equivalent to\ncontinuity, and therefore the problem boils down to deciding continuity. We\nestablish a generic characterisation of continuity for functions preserving\nregular languages under inverse image (such as regular functions). 
We exploit\nthis characterisation to show the decidability of continuity (and hence\ncomputability) of rational and regular functions. For rational functions, we\nshow that this can be done in $\\mathsf{NLogSpace}$ (it was already known to be\nin $\\mathsf{PTime}$ by Prieur). In a similar fashion, we also effectively\ncharacterise uniform continuity of regular functions, and relate it to the\nnotion of uniform computability, which offers stronger efficiency guarantees.\n","authors":["V. Dave","E. Filiot","S. Krishna","N. Lhote"],"pdf_url":"https://arxiv.org/pdf/1906.04199v6.pdf","comment":null}],"Hardware Architecturea":[{"id":"http://arxiv.org/abs/2406.14706v2","updated":"2024-09-18T13:52:54Z","published":"2024-06-20T19:58:28Z","title":"WAGONN: Weight Bit Agglomeration in Crossbar Arrays for Reduced Impact\n of Interconnect Resistance on DNN Inference Accuracy","summary":" Deep neural network (DNN) accelerators employing crossbar arrays capable of\nin-memory computing (IMC) are highly promising for neural computing platforms.\nHowever, in deeply scaled technologies, interconnect resistance severely\nimpairs IMC robustness, leading to a drop in the system accuracy. To address\nthis problem, we propose SWANN - a technique based on shuffling weights in\ncrossbar arrays which alleviates the detrimental effect of wire resistance on\nIMC. For 8T-SRAM-based 128x128 crossbar arrays in 7nm technology, SWANN\nenhances the accuracy from 47.78% to 83.5% for ResNet-20/CIFAR-10. We also show\nthat SWANN can be used synergistically with Partial-Word-LineActivation,\nfurther boosting the accuracy. Moreover, we evaluate the implications of SWANN\nfor compact ferroelectric-transistorbased crossbar arrays. 
SWANN incurs minimal\nhardware overhead, with less than a 1% increase in energy consumption.\nAdditionally, the latency and area overheads of SWANN are ~1% and ~16%,\nrespectively when 1 ADC is utilized per crossbar array.\n","authors":["Jeffry Victor","Dong Eun Kim","Chunguang Wang","Kaushik Roy","Sumeet Gupta"],"pdf_url":"https://arxiv.org/pdf/2406.14706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11612v1","updated":"2024-09-18T00:23:00Z","published":"2024-09-18T00:23:00Z","title":"Hardware-Friendly Implementation of Physical Reservoir Computing with\n CMOS-based Time-domain Analog Spiking Neurons","summary":" This paper introduces an analog spiking neuron that utilizes time-domain\ninformation, i.e., a time interval of two signal transitions and a pulse width,\nto construct a spiking neural network (SNN) for a hardware-friendly physical\nreservoir computing (RC) on a complementary metal-oxide-semiconductor (CMOS)\nplatform. A neuron with leaky integrate-and-fire is realized by employing two\nvoltage-controlled oscillators (VCOs) with opposite sensitivities to the\ninternal control voltage, and the neuron connection structure is restricted by\nthe use of only 4 neighboring neurons on the 2-dimensional plane to feasibly\nconstruct a regular network topology. Such a system enables us to compose an\nSNN with a counter-based readout circuit, which simplifies the hardware\nimplementation of the SNN. Moreover, another technical advantage thanks to the\nbottom-up integration is the capability of dynamically capturing every neuron\nstate in the network, which can significantly contribute to finding guidelines\non how to enhance the performance for various computational tasks in temporal\ninformation processing. Diverse nonlinear physical dynamics needed for RC can\nbe realized by collective behavior through dynamic interaction between neurons,\nlike coupled oscillators, despite the simple network structure. 
With behavioral\nsystem-level simulations, we demonstrate physical RC through short-term memory\nand exclusive OR tasks, and the spoken digit recognition task with an accuracy\nof 97.7% as well. Our system is considerably feasible for practical\napplications and also can be a useful platform for studying the mechanism of\nphysical RC.\n","authors":["Nanako Kimura","Ckristian Duran","Zolboo Byambadorj","Ryosho Nakane","Tetsuya Iizuka"],"pdf_url":"https://arxiv.org/pdf/2409.11612v1.pdf","comment":null}],"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2409.11765v1","updated":"2024-09-18T07:44:56Z","published":"2024-09-18T07:44:56Z","title":"Massively parallel CMA-ES with increasing population","summary":" The Increasing Population Covariance Matrix Adaptation Evolution Strategy\n(IPOP-CMA-ES) algorithm is a reference stochastic optimizer dedicated to\nblackbox optimization, where no prior knowledge about the underlying problem\nstructure is available. This paper aims at accelerating IPOP-CMA-ES thanks to\nhigh performance computing and parallelism when solving large optimization\nproblems. We first show how BLAS and LAPACK routines can be introduced in\nlinear algebra operations, and we then propose two strategies for deploying\nIPOP-CMA-ES efficiently on large-scale parallel architectures with thousands of\nCPU cores. The first parallel strategy processes the multiple searches in the\nsame ordering as the sequential IPOP-CMA-ES, while the second one processes\nconcurrently these multiple searches. These strategies are implemented in\nMPI+OpenMP and compared on 6144 cores of the supercomputer Fugaku. 
We manage to\nobtain substantial speedups (up to several thousand) and even super-linear\nones, and we provide an in-depth analysis of our results to understand\nprecisely the superior performance of our second strategy.\n","authors":["David Redon","Pierre Fortin","Bilel Derbel","Miwako Tsuji","Mitsuhisa Sato"],"pdf_url":"https://arxiv.org/pdf/2409.11765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13787v2","updated":"2024-09-18T06:44:48Z","published":"2024-08-25T09:30:34Z","title":"Mask-Encoded Sparsification: Mitigating Biased Gradients in\n Communication-Efficient Split Learning","summary":" This paper introduces a novel framework designed to achieve a high\ncompression ratio in Split Learning (SL) scenarios where resource-constrained\ndevices are involved in large-scale model training. Our investigations\ndemonstrate that compressing feature maps within SL leads to biased gradients\nthat can negatively impact the convergence rates and diminish the\ngeneralization capabilities of the resulting models. Our theoretical analysis\nprovides insights into how compression errors critically hinder SL performance,\nwhich previous methodologies underestimate. To address these challenges, we\nemploy a narrow bit-width encoded mask to compensate for the sparsification\nerror without increasing the order of time complexity. Supported by rigorous\ntheoretical analysis, our framework significantly reduces compression errors\nand accelerates the convergence. 
Extensive experiments also verify that our\nmethod outperforms existing solutions regarding training efficiency and\ncommunication complexity.\n","authors":["Wenxuan Zhou","Zhihao Qu","Shen-Huan Lyu","Miao Cai","Baoliu Ye"],"pdf_url":"https://arxiv.org/pdf/2408.13787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11621v1","updated":"2024-09-18T00:56:14Z","published":"2024-09-18T00:56:14Z","title":"Blockchain-Enabled IoV: Secure Communication and Trustworthy\n Decision-Making","summary":" The Internet of Vehicles (IoV), which enables interactions between vehicles,\ninfrastructure, and the environment, faces challenges in maintaining\ncommunication security and reliable automated decisions. This paper introduces\na decentralized framework comprising a primary layer for managing inter-vehicle\ncommunication and a sub-layer for securing intra-vehicle interactions. By\nimplementing blockchain-based protocols like Blockchain-integrated Secure\nAuthentication (BiSA) and Decentralized Blockchain Name Resolution (DBNR), the\nframework ensures secure, decentralized identity management and reliable data\nexchanges, thereby supporting safe and efficient autonomous vehicle operations.\n","authors":["Jingyi Sun","Qi Shi","Guodong Jin","Hao Xu","Erwu Liu"],"pdf_url":"https://arxiv.org/pdf/2409.11621v1.pdf","comment":"The 2024 7th IEEE Conference on Dependable and Secure Computing"},{"id":"http://arxiv.org/abs/2409.08584v2","updated":"2024-09-18T22:02:00Z","published":"2024-09-13T07:03:01Z","title":"CompressedMediQ: Hybrid Quantum Machine Learning Pipeline for\n High-Dimensional Neuroimaging Data","summary":" This paper introduces CompressedMediQ, a novel hybrid quantum-classical\nmachine learning pipeline specifically developed to address the computational\nchallenges associated with high-dimensional multi-class neuroimaging data\nanalysis. 
Standard neuroimaging datasets, such as 4D MRI data from the\nAlzheimer's Disease Neuroimaging Initiative (ADNI) and Neuroimaging in\nFrontotemporal Dementia (NIFD), present significant hurdles due to their vast\nsize and complexity. CompressedMediQ integrates classical high-performance\ncomputing (HPC) nodes for advanced MRI pre-processing and Convolutional Neural\nNetwork (CNN)-PCA-based feature extraction and reduction, addressing the\nlimited-qubit availability for quantum data encoding in the NISQ (Noisy\nIntermediate-Scale Quantum) era. This is followed by Quantum Support Vector\nMachine (QSVM) classification. By utilizing quantum kernel methods, the\npipeline optimizes feature mapping and classification, enhancing data\nseparability and outperforming traditional neuroimaging analysis techniques.\nExperimental results highlight the pipeline's superior accuracy in dementia\nstaging, validating the practical use of quantum machine learning in clinical\ndiagnostics. Despite the limitations of NISQ devices, this proof-of-concept\ndemonstrates the transformative potential of quantum-enhanced learning, paving\nthe way for scalable and precise diagnostic tools in healthcare and signal\nprocessing.\n","authors":["Kuan-Cheng Chen","Yi-Tien Li","Tai-Yu Li","Chen-Yu Liu","Cheng-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08584v2.pdf","comment":null}],"Programming and Languages":[{"id":"http://arxiv.org/abs/2409.12013v1","updated":"2024-09-18T14:28:19Z","published":"2024-09-18T14:28:19Z","title":"Memory Consistency and Program Transformations","summary":" A memory consistency model specifies the allowed behaviors of shared memory\nconcurrent programs. At the language level, these models are known to have a\nnon-trivial impact on the safety of program optimizations, limiting the ability\nto rearrange/refactor code without introducing new behaviors. 
Existing\nprogramming language memory models try to address this by permitting more\n(relaxed/weak) concurrent behaviors but are still unable to allow all the\ndesired optimizations. A core problem is that weaker consistency models may\nalso render optimizations unsafe, a conclusion that goes against the intuition\nof them allowing more behaviors. This exposes an open problem of the\ncompositional interaction between memory consistency semantics and\noptimizations: which parts of the semantics correspond to allowing/disallowing\nwhich set of optimizations is unclear. In this work, we establish a formal\nfoundation suitable enough to understand this compositional nature, decomposing\noptimizations into a finite set of elementary effects on program execution\ntraces, over which aspects of safety can be assessed. We use this decomposition\nto identify a desirable compositional property (complete) that would guarantee\nthe safety of optimizations from one memory model to another. We showcase its\npracticality by proving such a property between Sequential Consistency (SC) and\n$SC_{RR}$, the latter allowing independent read-read reordering over $SC$. Our\nwork potentially paves way to a new design methodology of programming-language\nmemory models, one that places emphasis on the optimizations desired to be\nperformed.\n","authors":["Akshay Gopalakrishnan","Clark Verbrugge","Mark Batty"],"pdf_url":"https://arxiv.org/pdf/2409.12013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14705v2","updated":"2024-09-18T08:57:59Z","published":"2024-07-19T23:42:28Z","title":"Reactive graphs in action (extended version)","summary":" Reactive graphs are transition structures whereas edges become active and\ninactive during its evolution, that were introduced by Dov Gabbay from a\nmathematical's perspective. This paper presents Marge\n(https://fm-dcc.github.io/MARGe), a web-based tool to visualise and analyse\nreactive graphs enriched with labels. 
Marge animates the operational semantics\nof reactive graphs and offers different graphical views to provide insights\nover concrete systems. We motivate the applicability of reactive graphs for\nadaptive systems and for featured transition systems, using Marge to tighten\nthe gap between the existing theoretical models and their usage to analyse\nconcrete systems.\n","authors":["David Tinoco","Alexandre Madeira","Manuel A. Martins","José Proença"],"pdf_url":"https://arxiv.org/pdf/2407.14705v2.pdf","comment":"Companion paper of an article accepted at FACS 2024 with a similar\n name"}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2409.11617v1","updated":"2024-09-18T00:44:50Z","published":"2024-09-18T00:44:50Z","title":"HRA: A Multi-Criteria Framework for Ranking Metaheuristic Optimization\n Algorithms","summary":" Metaheuristic algorithms are essential for solving complex optimization\nproblems in different fields. However, the difficulty in comparing and rating\nthese algorithms remains due to the wide range of performance metrics and\nproblem dimensions usually involved. On the other hand, nonparametric\nstatistical methods and post hoc tests are time-consuming, especially when we\nonly need to identify the top performers among many algorithms. The\nHierarchical Rank Aggregation (HRA) algorithm aims to efficiently rank\nmetaheuristic algorithms based on their performance across many criteria and\ndimensions. The HRA employs a hierarchical framework that begins with\ncollecting performance metrics on various benchmark functions and dimensions.\nRank-based normalization is employed for each performance measure to ensure\ncomparability and the robust TOPSIS aggregation is applied to combine these\nrankings at several hierarchical levels, resulting in a comprehensive ranking\nof the algorithms. Our study uses data from the CEC 2017 competition to\ndemonstrate the robustness and efficacy of the HRA framework. 
It examines 30\nbenchmark functions and evaluates the performance of 13 metaheuristic\nalgorithms across five performance indicators in four distinct dimensions. This\npresentation highlights the potential of the HRA to enhance the interpretation\nof the comparative advantages and disadvantages of various algorithms by\nsimplifying practitioners' choices of the most appropriate algorithm for\ncertain optimization problems.\n","authors":["Evgenia-Maria K. Goula","Dimitris G. Sotiropoulos"],"pdf_url":"https://arxiv.org/pdf/2409.11617v1.pdf","comment":"13 pages, 1 figure"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2409.12029v1","updated":"2024-09-18T14:45:23Z","published":"2024-09-18T14:45:23Z","title":"Biological arrow of time: Emergence of tangled information hierarchies\n and self-modelling dynamics","summary":" We study open-ended evolution by focusing on computational and\ninformation-processing dynamics underlying major evolutionary transitions. In\ndoing so, we consider biological organisms as hierarchical dynamical systems\nthat generate regularities in their phase-spaces through interactions with\ntheir environment. These emergent information patterns can then be encoded\nwithin the organism's components, leading to self-modelling \"tangled\nhierarchies\". Our main conjecture is that when macro-scale patterns are encoded\nwithin micro-scale components, it creates fundamental tensions (computational\ninconsistencies) between what is encodable at a particular evolutionary stage\nand what is potentially realisable in the environment. A resolution of these\ntensions triggers an evolutionary transition which expands the problem-space,\nat the cost of generating new tensions in the expanded space, in a continual\nprocess. We argue that biological complexification can be interpreted\ncomputation-theoretically, within the G\\\"odel--Turing--Post recursion-theoretic\nframework, as open-ended generation of computational novelty. 
In general, this\nprocess can be viewed as a meta-simulation performed by higher-order systems\nthat successively simulate the computation carried out by lower-order systems.\nThis computation-theoretic argument provides a basis for hypothesising the\nbiological arrow of time.\n","authors":["Mikhail Prokopenko","Paul C. W. Davies","Michael Harré","Marcus Heisler","Zdenka Kuncic","Geraint F. Lewis","Ori Livson","Joseph T. Lizier","Fernando E. Rosas"],"pdf_url":"https://arxiv.org/pdf/2409.12029v1.pdf","comment":"30 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.11999v1","updated":"2024-09-18T14:10:42Z","published":"2024-09-18T14:10:42Z","title":"On Randomized Computational Models and Complexity Classes: a Historical\n Overview","summary":" Since their appearance in the 1950s, computational models capable of\nperforming probabilistic choices have received wide attention and are nowadays\npervasive in almost every areas of computer science. Their development was also\ninextricably linked with inquiries about computation power and resource issues.\nAlthough most crucial notions in the field are well-known, the related\nterminology is sometimes imprecise or misleading. The present work aims to\nclarify the core features and main differences between machines and classes\ndeveloped in relation to randomized computation. 
To do so, we compare the\nmodern definitions with original ones, recalling the context in which they\nfirst appeared, and investigate the relations linking probabilistic and\ncounting models.\n","authors":["Melissa Antonelli","Ugo Dal Lago","Paolo Pistone"],"pdf_url":"https://arxiv.org/pdf/2409.11999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11946v1","updated":"2024-09-18T13:00:02Z","published":"2024-09-18T13:00:02Z","title":"An Imperative Language for Verified Exact Real-Number Computation","summary":" We introduce Clerical, a programming language for exact real-number\ncomputation that combines first-order imperative-style programming with a limit\noperator for computation of real numbers as limits of Cauchy sequences. We\naddress the semidecidability of the linear ordering of the reals by\nincorporating nondeterministic guarded choice, through which decisions based on\npartial comparison operations on reals can be patched together to give total\nprograms. The interplay between mutable state, nondeterminism, and computation\nof limits is controlled by the requirement that expressions computing limits\nand guards modify only local state. We devise a domain-theoretic denotational\nsemantics that uses a variant of Plotkin powerdomain construction tailored to\nour specific version of nondeterminism. We formulate a Hoare-style\nspecification logic, show that it is sound for the denotational semantics, and\nillustrate the setup by implementing and proving correct a program for\ncomputation of $\\pi$ as the least positive zero of $\\sin$. The modular\ncharacter of Clerical allows us to compose the program from smaller parts, each\nof which is shown to be correct on its own. 
We provide a proof-of-concept OCaml\nimplementation of Clerical, and formally verify parts of the development,\nnotably the soundness of specification logic, in the Coq proof assistant.\n","authors":["Andrej Bauer","Sewon Park","Alex Simpson"],"pdf_url":"https://arxiv.org/pdf/2409.11946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05680v3","updated":"2024-09-18T11:43:43Z","published":"2024-02-08T13:58:16Z","title":"Interpretable classifiers for tabular data via discretization and\n feature selection","summary":" We introduce a method for computing immediately human interpretable yet\naccurate classifiers from tabular data. The classifiers obtained are short\nBoolean formulas, computed via first discretizing the original data and then\nusing feature selection coupled with a very fast algorithm for producing the\nbest possible Boolean classifier for the setting. We demonstrate the approach\nvia 12 experiments, obtaining results with accuracies comparable to ones\nobtained via random forests, XGBoost, and existing results for the same\ndatasets in the literature. In most cases, the accuracy of our method is in\nfact similar to that of the reference methods, even though the main objective\nof our study is the immediate interpretability of our classifiers. 
We also\nprove a new result on the probability that the classifier we obtain from\nreal-life data corresponds to the ideally best classifier with respect to the\nbackground distribution the data comes from.\n","authors":["Reijo Jaakkola","Tomi Janhunen","Antti Kuusisto","Masood Feyzbakhsh Rankooh","Miikka Vilander"],"pdf_url":"https://arxiv.org/pdf/2402.05680v3.pdf","comment":"Preprint of a paper in DAO-XAI 2024 (Data meets Applied Ontologies in\n Explainable AI)"},{"id":"http://arxiv.org/abs/1906.04199v6","updated":"2024-09-18T08:09:05Z","published":"2019-05-15T11:35:35Z","title":"Synthesis of Computable Regular Functions of Infinite Words","summary":" Regular functions from infinite words to infinite words can be equivalently\nspecified by MSO-transducers, streaming $\\omega$-string transducers as well as\ndeterministic two-way transducers with look-ahead. In their one-way\nrestriction, the latter transducers define the class of rational functions.\nEven though regular functions are robustly characterised by several\nfinite-state devices, even the subclass of rational functions may contain\nfunctions which are not computable (by a Turing machine with infinite input).\nThis paper proposes a decision procedure for the following synthesis problem:\ngiven a regular function $f$ (equivalently specified by one of the\naforementioned transducer model), is $f$ computable and if it is, synthesize a\nTuring machine computing it.\n For regular functions, we show that computability is equivalent to\ncontinuity, and therefore the problem boils down to deciding continuity. We\nestablish a generic characterisation of continuity for functions preserving\nregular languages under inverse image (such as regular functions). We exploit\nthis characterisation to show the decidability of continuity (and hence\ncomputability) of rational and regular functions. 
For rational functions, we\nshow that this can be done in $\\mathsf{NLogSpace}$ (it was already known to be\nin $\\mathsf{PTime}$ by Prieur). In a similar fashion, we also effectively\ncharacterise uniform continuity of regular functions, and relate it to the\nnotion of uniform computability, which offers stronger efficiency guarantees.\n","authors":["V. Dave","E. Filiot","S. Krishna","N. Lhote"],"pdf_url":"https://arxiv.org/pdf/1906.04199v6.pdf","comment":null}]},"2024-09-19T00:00:00Z":{"Distributed, Parallel, and Cluster Computing":[{"id":"http://arxiv.org/abs/2409.05500v2","updated":"2024-09-19T08:01:24Z","published":"2024-09-09T10:52:58Z","title":"Optimizing VarLiNGAM for Scalable and Efficient Time Series Causal\n Discovery","summary":" Causal discovery identifies causal relationships in data, but the task is\nmore complex for multivariate time series due to the computational demands of\nmethods like VarLiNGAM, which combines a Vector Autoregressive Model with a\nLinear Non-Gaussian Acyclic Model. This study optimizes causal discovery\nspecifically for time series data, which are common in practical applications.\nTime series causal discovery is particularly challenging because of temporal\ndependencies and potential time lag effects. By developing a specialized\ndataset generator and reducing the computational complexity of the VarLiNGAM\nmodel from \\( O(m^3 \\cdot n) \\) to \\( O(m^3 + m^2 \\cdot n) \\), this study\nenhances the feasibility of processing large datasets. The proposed methods\nwere validated on advanced computational platforms and tested on simulated,\nreal-world, and large-scale datasets, demonstrating improved efficiency and\nperformance. The optimized algorithm achieved 7 to 13 times speedup compared to\nthe original and about 4.5 times speedup compared to the GPU-accelerated\nversion on large-scale datasets with feature sizes from 200 to 400. 
Our methods\nextend current causal discovery capabilities, making them more robust,\nscalable, and applicable to real-world scenarios, facilitating advancements in\nfields like healthcare and finance.\n","authors":["Ziyang Jiao","Ce Guo","Wayne Luk"],"pdf_url":"https://arxiv.org/pdf/2409.05500v2.pdf","comment":null}],"Performance Profiling":[{"id":"http://arxiv.org/abs/2409.05500v2","updated":"2024-09-19T08:01:24Z","published":"2024-09-09T10:52:58Z","title":"Optimizing VarLiNGAM for Scalable and Efficient Time Series Causal\n Discovery","summary":" Causal discovery identifies causal relationships in data, but the task is\nmore complex for multivariate time series due to the computational demands of\nmethods like VarLiNGAM, which combines a Vector Autoregressive Model with a\nLinear Non-Gaussian Acyclic Model. This study optimizes causal discovery\nspecifically for time series data, which are common in practical applications.\nTime series causal discovery is particularly challenging because of temporal\ndependencies and potential time lag effects. By developing a specialized\ndataset generator and reducing the computational complexity of the VarLiNGAM\nmodel from \\( O(m^3 \\cdot n) \\) to \\( O(m^3 + m^2 \\cdot n) \\), this study\nenhances the feasibility of processing large datasets. The proposed methods\nwere validated on advanced computational platforms and tested on simulated,\nreal-world, and large-scale datasets, demonstrating improved efficiency and\nperformance. The optimized algorithm achieved 7 to 13 times speedup compared to\nthe original and about 4.5 times speedup compared to the GPU-accelerated\nversion on large-scale datasets with feature sizes from 200 to 400. 
Our methods\nextend current causal discovery capabilities, making them more robust,\nscalable, and applicable to real-world scenarios, facilitating advancements in\nfields like healthcare and finance.\n","authors":["Ziyang Jiao","Ce Guo","Wayne Luk"],"pdf_url":"https://arxiv.org/pdf/2409.05500v2.pdf","comment":null}],"Formal Languages and Automata Theory":[{"id":"http://arxiv.org/abs/2409.07077v3","updated":"2024-09-19T09:25:03Z","published":"2024-09-11T08:02:06Z","title":"Submonoid Membership in n-dimensional lamplighter groups and S-unit\n equations","summary":" We show that Submonoid Membership is decidable in n-dimensional lamplighter\ngroups $(\\mathbb{Z}/p\\mathbb{Z}) \\wr \\mathbb{Z}^n$ for any prime $p$ and\ninteger $n$. More generally, we show decidability of Submonoid Membership in\nsemidirect products of the form $\\mathcal{Y} \\rtimes \\mathbb{Z}^n$, where\n$\\mathcal{Y}$ is any finitely presented module over the Laurent polynomial ring\n$\\mathbb{F}_p[X_1^{\\pm}, \\ldots, X_n^{\\pm}]$. Combined with a result of Shafrir\n(2024), this gives the first example of a group $G$ and a finite index subgroup\n$\\widetilde{G} \\leq G$, such that Submonoid Membership is decidable in\n$\\widetilde{G}$ but undecidable in $G$.\n To obtain our decidability result, we reduce Submonoid Membership in\n$\\mathcal{Y} \\rtimes \\mathbb{Z}^n$ to solving S-unit equations over\n$\\mathbb{F}_p[X_1^{\\pm}, \\ldots, X_n^{\\pm}]$-modules. We show that the solution\nset of such equations is effectively $p$-automatic, extending a result of\nAdamczewski and Bell (2012). 
As an intermediate result, we also obtain that the\nsolution set of the Knapsack Problem in $\\mathcal{Y} \\rtimes \\mathbb{Z}^n$ is\neffectively $p$-automatic.\n","authors":["Ruiwen Dong"],"pdf_url":"https://arxiv.org/pdf/2409.07077v3.pdf","comment":"Added funding information, 21 pages"},{"id":"http://arxiv.org/abs/2409.05498v2","updated":"2024-09-19T05:47:59Z","published":"2024-09-09T10:49:54Z","title":"Deciding the synthesis problem for hybrid games through bisimulation","summary":" Hybrid games are games played on a finite graph endowed with real variables\nwhich may model behaviors of discrete controllers of continuous systems. The\nsynthesis problem for hybrid games is decidable for classical objectives (like\nLTL formulas) when the games are initialized singular, meaning that the slopes\nof the continuous variables are piecewise constant and variables are reset\nwhenever their slope changes. The known proof adapts the region construction\nfrom timed games. In this paper we show that initialized singular games can be\nreduced, via a sequence of alternating bisimulations, to timed games,\ngeneralizing the known reductions by bisimulation from initialized singular\nautomata to timed automata. Alternating bisimulation is the generalization of\nbisimulation to games, accomodating a strategy translation lemma by which, when\ntwo games are bisimilar and carry the same observations, each strategy in one\nof the games can be translated to a strategy in the second game such that all\nthe outcomes of the second strategy satisfies the same property that are\nsatisfied by the first strategy. 
The advantage of the proposed approach is that\none may then use realizability tools for timed games to synthesize a winning\nstrategy for a given objective, and then use the strategy translation lemma to\nobtain a winning strategy in the hybrid game for the same objective.\n","authors":["Catalin Dima","Mariem Hammami","Youssouf Oualhadj","Régine Laleau"],"pdf_url":"https://arxiv.org/pdf/2409.05498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12240v2","updated":"2024-09-19T12:21:39Z","published":"2024-08-22T09:17:59Z","title":"The Bright Side of Timed Opacity","summary":" In 2009, Franck Cassez showed that the timed opacity problem, where an\nattacker can observe some actions with their timestamps and attempts to deduce\ninformation, is undecidable for timed automata (TAs). Moreover, he showed that\nthe undecidability holds even for subclasses such as event-recording automata.\nIn this article, we consider the same definition of opacity for several other\nsubclasses of TAs: with restrictions on the number of clocks, of actions, on\nthe nature of time, or on a new subclass called observable event-recording\nautomata. We show that opacity can mostly be retrieved, except for one-action\nTAs and for one-clock TAs with $\\epsilon$-transitions, for which undecidability\nremains. 
We then exhibit a new decidable subclass in which the number of\nobservations made by the attacker is limited.\n","authors":["Étienne André","Sarah Dépernet","Engel Lefaucheux"],"pdf_url":"https://arxiv.org/pdf/2408.12240v2.pdf","comment":"This is the author (and extended) version of the manuscript of the\n same name published in the proceedings of the 25th International Conference\n on Formal Engineering Methods (ICFEM 2024)"}],"Logic in Computer Science":[{"id":"http://arxiv.org/abs/2409.05498v2","updated":"2024-09-19T05:47:59Z","published":"2024-09-09T10:49:54Z","title":"Deciding the synthesis problem for hybrid games through bisimulation","summary":" Hybrid games are games played on a finite graph endowed with real variables\nwhich may model behaviors of discrete controllers of continuous systems. The\nsynthesis problem for hybrid games is decidable for classical objectives (like\nLTL formulas) when the games are initialized singular, meaning that the slopes\nof the continuous variables are piecewise constant and variables are reset\nwhenever their slope changes. The known proof adapts the region construction\nfrom timed games. In this paper we show that initialized singular games can be\nreduced, via a sequence of alternating bisimulations, to timed games,\ngeneralizing the known reductions by bisimulation from initialized singular\nautomata to timed automata. Alternating bisimulation is the generalization of\nbisimulation to games, accomodating a strategy translation lemma by which, when\ntwo games are bisimilar and carry the same observations, each strategy in one\nof the games can be translated to a strategy in the second game such that all\nthe outcomes of the second strategy satisfies the same property that are\nsatisfied by the first strategy. 
The advantage of the proposed approach is that\none may then use realizability tools for timed games to synthesize a winning\nstrategy for a given objective, and then use the strategy translation lemma to\nobtain a winning strategy in the hybrid game for the same objective.\n","authors":["Catalin Dima","Mariem Hammami","Youssouf Oualhadj","Régine Laleau"],"pdf_url":"https://arxiv.org/pdf/2409.05498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12240v2","updated":"2024-09-19T12:21:39Z","published":"2024-08-22T09:17:59Z","title":"The Bright Side of Timed Opacity","summary":" In 2009, Franck Cassez showed that the timed opacity problem, where an\nattacker can observe some actions with their timestamps and attempts to deduce\ninformation, is undecidable for timed automata (TAs). Moreover, he showed that\nthe undecidability holds even for subclasses such as event-recording automata.\nIn this article, we consider the same definition of opacity for several other\nsubclasses of TAs: with restrictions on the number of clocks, of actions, on\nthe nature of time, or on a new subclass called observable event-recording\nautomata. We show that opacity can mostly be retrieved, except for one-action\nTAs and for one-clock TAs with $\\epsilon$-transitions, for which undecidability\nremains. 
We then exhibit a new decidable subclass in which the number of\nobservations made by the attacker is limited.\n","authors":["Étienne André","Sarah Dépernet","Engel Lefaucheux"],"pdf_url":"https://arxiv.org/pdf/2408.12240v2.pdf","comment":"This is the author (and extended) version of the manuscript of the\n same name published in the proceedings of the 25th International Conference\n on Formal Engineering Methods (ICFEM 2024)"}],"Computational Complexity":[{"id":"http://arxiv.org/abs/2408.10114v4","updated":"2024-09-19T15:42:03Z","published":"2024-08-19T16:01:21Z","title":"Topics in Non-local Games: Synchronous Algebras, Algebraic Graph\n Identities, and Quantum NP-hardness Reductions","summary":" We review the correspondence between synchronous games and their associated\n$*$-algebra. Building upon the work of (Helton et al., New York J. Math. 2017),\nwe propose results on algebraic and locally commuting graph identities. Based\non the noncommutative Nullstellens\\\"atze (Watts, Helton and Klep, Annales Henri\nPoincar\\'e 2023), we build computational tools that check the non-existence of\nperfect $C^*$ and algebraic strategies of synchronous games using Gr\\\"obner\nbasis methods and semidefinite programming. We prove the equivalence between\nthe hereditary and $C^*$ models questioned in (Helton et al., New York J. Math.\n2017). We also extend the quantum-version NP-hardness reduction\n$\\texttt{3-Coloring}^* \\leq_p \\texttt{3-SAT}^*$ due to (Ji, arXiv 2013) by\nexhibiting another instance of such reduction $\\texttt{Clique}^* \\leq_p\n\\texttt{3-SAT}^*$.\n","authors":["Entong He"],"pdf_url":"https://arxiv.org/pdf/2408.10114v4.pdf","comment":"21 pages. Research conducted under the supervision of Dr. Connor\n Paddock and Prof. 
Anne Broadbent"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + 
--footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + 
--accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + 
font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..b115d9ed --- /dev/null +++ b/index.html @@ -0,0 +1,9196 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 1 + +
+
+
+ + ♻ ☆ Optimizing VarLiNGAM for Scalable and Efficient Time Series Causal + Discovery + + +
+ Causal discovery identifies causal relationships in data, but the task is +more complex for multivariate time series due to the computational demands of +methods like VarLiNGAM, which combines a Vector Autoregressive Model with a +Linear Non-Gaussian Acyclic Model. This study optimizes causal discovery +specifically for time series data, which are common in practical applications. +Time series causal discovery is particularly challenging because of temporal +dependencies and potential time lag effects. By developing a specialized +dataset generator and reducing the computational complexity of the VarLiNGAM +model from \( O(m^3 \cdot n) \) to \( O(m^3 + m^2 \cdot n) \), this study +enhances the feasibility of processing large datasets. The proposed methods +were validated on advanced computational platforms and tested on simulated, +real-world, and large-scale datasets, demonstrating improved efficiency and +performance. The optimized algorithm achieved 7 to 13 times speedup compared to +the original and about 4.5 times speedup compared to the GPU-accelerated +version on large-scale datasets with feature sizes from 200 to 400. Our methods +extend current causal discovery capabilities, making them more robust, +scalable, and applicable to real-world scenarios, facilitating advancements in +fields like healthcare and finance. + +
+
+
+
+
+
+
+
+ + Performance Profiling 1 + +
+
+
+ + ♻ ☆ Optimizing VarLiNGAM for Scalable and Efficient Time Series Causal + Discovery + + +
+ Causal discovery identifies causal relationships in data, but the task is +more complex for multivariate time series due to the computational demands of +methods like VarLiNGAM, which combines a Vector Autoregressive Model with a +Linear Non-Gaussian Acyclic Model. This study optimizes causal discovery +specifically for time series data, which are common in practical applications. +Time series causal discovery is particularly challenging because of temporal +dependencies and potential time lag effects. By developing a specialized +dataset generator and reducing the computational complexity of the VarLiNGAM +model from \( O(m^3 \cdot n) \) to \( O(m^3 + m^2 \cdot n) \), this study +enhances the feasibility of processing large datasets. The proposed methods +were validated on advanced computational platforms and tested on simulated, +real-world, and large-scale datasets, demonstrating improved efficiency and +performance. The optimized algorithm achieved 7 to 13 times speedup compared to +the original and about 4.5 times speedup compared to the GPU-accelerated +version on large-scale datasets with feature sizes from 200 to 400. Our methods +extend current causal discovery capabilities, making them more robust, +scalable, and applicable to real-world scenarios, facilitating advancements in +fields like healthcare and finance. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 3 + +
+
+
+ + ♻ ☆ Submonoid Membership in n-dimensional lamplighter groups and S-unit + equations + + +
+ We show that Submonoid Membership is decidable in n-dimensional lamplighter +groups $(\mathbb{Z}/p\mathbb{Z}) \wr \mathbb{Z}^n$ for any prime $p$ and +integer $n$. More generally, we show decidability of Submonoid Membership in +semidirect products of the form $\mathcal{Y} \rtimes \mathbb{Z}^n$, where +$\mathcal{Y}$ is any finitely presented module over the Laurent polynomial ring +$\mathbb{F}_p[X_1^{\pm}, \ldots, X_n^{\pm}]$. Combined with a result of Shafrir +(2024), this gives the first example of a group $G$ and a finite index subgroup +$\widetilde{G} \leq G$, such that Submonoid Membership is decidable in +$\widetilde{G}$ but undecidable in $G$. + To obtain our decidability result, we reduce Submonoid Membership in +$\mathcal{Y} \rtimes \mathbb{Z}^n$ to solving S-unit equations over +$\mathbb{F}_p[X_1^{\pm}, \ldots, X_n^{\pm}]$-modules. We show that the solution +set of such equations is effectively $p$-automatic, extending a result of +Adamczewski and Bell (2012). As an intermediate result, we also obtain that the +solution set of the Knapsack Problem in $\mathcal{Y} \rtimes \mathbb{Z}^n$ is +effectively $p$-automatic. + +
+
+ comment: Added funding information, 21 pages +
+
+
+
+
+ + ♻ ☆ Deciding the synthesis problem for hybrid games through bisimulation + + +
+ Hybrid games are games played on a finite graph endowed with real variables +which may model behaviors of discrete controllers of continuous systems. The +synthesis problem for hybrid games is decidable for classical objectives (like +LTL formulas) when the games are initialized singular, meaning that the slopes +of the continuous variables are piecewise constant and variables are reset +whenever their slope changes. The known proof adapts the region construction +from timed games. In this paper we show that initialized singular games can be +reduced, via a sequence of alternating bisimulations, to timed games, +generalizing the known reductions by bisimulation from initialized singular +automata to timed automata. Alternating bisimulation is the generalization of +bisimulation to games, accomodating a strategy translation lemma by which, when +two games are bisimilar and carry the same observations, each strategy in one +of the games can be translated to a strategy in the second game such that all +the outcomes of the second strategy satisfies the same property that are +satisfied by the first strategy. The advantage of the proposed approach is that +one may then use realizability tools for timed games to synthesize a winning +strategy for a given objective, and then use the strategy translation lemma to +obtain a winning strategy in the hybrid game for the same objective. + +
+
+
+
+
+ + ♻ ☆ The Bright Side of Timed Opacity + + +
+ In 2009, Franck Cassez showed that the timed opacity problem, where an +attacker can observe some actions with their timestamps and attempts to deduce +information, is undecidable for timed automata (TAs). Moreover, he showed that +the undecidability holds even for subclasses such as event-recording automata. +In this article, we consider the same definition of opacity for several other +subclasses of TAs: with restrictions on the number of clocks, of actions, on +the nature of time, or on a new subclass called observable event-recording +automata. We show that opacity can mostly be retrieved, except for one-action +TAs and for one-clock TAs with $\epsilon$-transitions, for which undecidability +remains. We then exhibit a new decidable subclass in which the number of +observations made by the attacker is limited. + +
+
+ comment: This is the author (and extended) version of the manuscript of the + same name published in the proceedings of the 25th International Conference + on Formal Engineering Methods (ICFEM 2024) +
+
+
+
+
+
+
+
+ + Logic in Computer Science 2 + +
+
+
+ + ♻ ☆ Deciding the synthesis problem for hybrid games through bisimulation + + +
+ Hybrid games are games played on a finite graph endowed with real variables +which may model behaviors of discrete controllers of continuous systems. The +synthesis problem for hybrid games is decidable for classical objectives (like +LTL formulas) when the games are initialized singular, meaning that the slopes +of the continuous variables are piecewise constant and variables are reset +whenever their slope changes. The known proof adapts the region construction +from timed games. In this paper we show that initialized singular games can be +reduced, via a sequence of alternating bisimulations, to timed games, +generalizing the known reductions by bisimulation from initialized singular +automata to timed automata. Alternating bisimulation is the generalization of +bisimulation to games, accomodating a strategy translation lemma by which, when +two games are bisimilar and carry the same observations, each strategy in one +of the games can be translated to a strategy in the second game such that all +the outcomes of the second strategy satisfies the same property that are +satisfied by the first strategy. The advantage of the proposed approach is that +one may then use realizability tools for timed games to synthesize a winning +strategy for a given objective, and then use the strategy translation lemma to +obtain a winning strategy in the hybrid game for the same objective. + +
+
+
+
+
+ + ♻ ☆ The Bright Side of Timed Opacity + + +
+ In 2009, Franck Cassez showed that the timed opacity problem, where an +attacker can observe some actions with their timestamps and attempts to deduce +information, is undecidable for timed automata (TAs). Moreover, he showed that +the undecidability holds even for subclasses such as event-recording automata. +In this article, we consider the same definition of opacity for several other +subclasses of TAs: with restrictions on the number of clocks, of actions, on +the nature of time, or on a new subclass called observable event-recording +automata. We show that opacity can mostly be retrieved, except for one-action +TAs and for one-clock TAs with $\epsilon$-transitions, for which undecidability +remains. We then exhibit a new decidable subclass in which the number of +observations made by the attacker is limited. + +
+
+ comment: This is the author (and extended) version of the manuscript of the + same name published in the proceedings of the 25th International Conference + on Formal Engineering Methods (ICFEM 2024) +
+
+
+
+
+
+
+
+ + Computational Complexity 1 + +
+
+
+ + ♻ ☆ Topics in Non-local Games: Synchronous Algebras, Algebraic Graph + Identities, and Quantum NP-hardness Reductions + + +
+ We review the correspondence between synchronous games and their associated +$*$-algebra. Building upon the work of (Helton et al., New York J. Math. 2017), +we propose results on algebraic and locally commuting graph identities. Based +on the noncommutative Nullstellens\"atze (Watts, Helton and Klep, Annales Henri +Poincar\'e 2023), we build computational tools that check the non-existence of +perfect $C^*$ and algebraic strategies of synchronous games using Gr\"obner +basis methods and semidefinite programming. We prove the equivalence between +the hereditary and $C^*$ models questioned in (Helton et al., New York J. Math. +2017). We also extend the quantum-version NP-hardness reduction +$\texttt{3-Coloring}^* \leq_p \texttt{3-SAT}^*$ due to (Ji, arXiv 2013) by +exhibiting another instance of such reduction $\texttt{Clique}^* \leq_p +\texttt{3-SAT}^*$. + +
+
+ comment: 21 pages. Research conducted under the supervision of Dr. Connor + Paddock and Prof. Anne Broadbent +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computational Complexity 4 + +
+
+
+ + ☆ Computational Dynamical Systems + + +
+ We study the computational complexity theory of smooth, finite-dimensional +dynamical systems. Building off of previous work, we give definitions for what +it means for a smooth dynamical system to simulate a Turing machine. We then +show that 'chaotic' dynamical systems (more precisely, Axiom A systems) and +'integrable' dynamical systems (more generally, measure-preserving systems) +cannot robustly simulate universal Turing machines, although such machines can +be robustly simulated by other kinds of dynamical systems. Subsequently, we +show that any Turing machine that can be encoded into a structurally stable +one-dimensional dynamical system must have a decidable halting problem, and +moreover an explicit time complexity bound in instances where it does halt. +More broadly, our work elucidates what it means for one 'machine' to simulate +another, and emphasizes the necessity of defining low-complexity 'encoders' and +'decoders' to translate between the dynamics of the simulation and the system +being simulated. We highlight how the notion of a computational dynamical +system leads to questions at the intersection of computational complexity +theory, dynamical systems theory, and real algebraic geometry. + +
+
+ comment: 46+14 pages, 6 figures; accepted to FOCS 2024 +
+
+
+
+
+ + ♻ ☆ Partitioning Problems with Splittings and Interval Targets + + +
+ The $n$-way number partitioning problem is a classic problem in combinatorial +optimization, with applications to diverse settings such as fair allocation and +machine scheduling. All these problems are NP-hard, but various approximation +algorithms are known. We consider three closely related kinds of +approximations. + The first two variants optimize the partition such that: in the first variant +some fixed number $s$ of items can be \emph{split} between two or more bins and +in the second variant we allow at most a fixed number $t$ of \emph{splittings}. +The third variant is a decision problem: the largest bin sum must be within a +pre-specified interval, parameterized by a fixed rational number $u$ times the +largest item size. + When the number of bins $n$ is unbounded, we show that every variant is +strongly {\sf NP}-complete. When the number of bins $n$ is fixed, the running +time depends on the fixed parameters $s,t,u$. For each variant, we give a +complete picture of its running time. + For $n=2$, the running time is easy to identify. Our main results consider +any fixed integer $n \geq 3$. Using a two-way polynomial-time reduction between +the first and the third variant, we show that $n$-way number-partitioning with +$s$ split items can be solved in polynomial time if $s \geq n-2$, and it is +{\sf NP}-complete otherwise. Also, $n$-way number-partitioning with $t$ +splittings can be solved in polynomial time if $t \geq n-1$, and it is {\sf +NP}-complete otherwise. Finally, we show that the third variant can be solved +in polynomial time if $u \geq (n-2)/n$, and it is {\sf NP}-complete otherwise. +Our positive results for the optimization problems consider both min-max and +max-min versions. + Using the same reduction, we provide a fully polynomial-time approximation +scheme for the case where the number of split items is lower than $n-2$. + +
+
+
+
+
+ + ♻ ☆ Character Complexity: A Novel Measure for Quantum Circuit Analysis + + +
+ In the rapidly evolving field of quantum computing, quantifying circuit +complexity remains a critical challenge. This paper introduces Character +Complexity, a novel measure that bridges Group-theoretic concepts with +practical quantum computing concerns. By leveraging tools from representation +theory, I prove several key properties of character complexity and establish a +surprising connection to the classical simulability of quantum circuits. This +new measure offers a fresh perspective on the complexity landscape of quantum +algorithms, potentially reshaping our understanding of quantum-classical +computational boundaries. I present innovative visualization methods for +character complexity, providing intuitive insights into the structure of +quantum circuits. The empirical results reveal intriguing scaling behaviors +with respect to qubit and gate counts, opening new avenues for quantum +algorithm design and optimization. This work not only contributes to the +theoretical foundations of quantum complexity but also offers practical tools +for the quantum computing community. As quantum hardware continues to advance, +character complexity could play a crucial role in developing more efficient +quantum algorithms and in exploring the fundamental limits of quantum +computation. + +
+
+
+
+
+ + ♻ ☆ NP-completeness of Tiling Finite Simply Connected Regions with a Fixed + Set of Wang Tiles + + +
+ The computational complexity of tiling finite simply connected regions with a +fixed set of tiles is studied in this paper. We show that the problem of tiling +simply connected regions with a fixed set of $23$ Wang tiles is NP-complete. As +a consequence, the problem of tiling simply connected regions with a fixed set +of $111$ rectangles is NP-complete. Our results improve that of Igor Pak and +Jed Yang by using fewer numbers of tiles. Notably in the case of Wang tiles, +the number has decreased by more than one third from $35$ to $23$. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 6 + +
+
+
+ + ☆ Computational Dynamical Systems + + +
+ We study the computational complexity theory of smooth, finite-dimensional +dynamical systems. Building off of previous work, we give definitions for what +it means for a smooth dynamical system to simulate a Turing machine. We then +show that 'chaotic' dynamical systems (more precisely, Axiom A systems) and +'integrable' dynamical systems (more generally, measure-preserving systems) +cannot robustly simulate universal Turing machines, although such machines can +be robustly simulated by other kinds of dynamical systems. Subsequently, we +show that any Turing machine that can be encoded into a structurally stable +one-dimensional dynamical system must have a decidable halting problem, and +moreover an explicit time complexity bound in instances where it does halt. +More broadly, our work elucidates what it means for one 'machine' to simulate +another, and emphasizes the necessity of defining low-complexity 'encoders' and +'decoders' to translate between the dynamics of the simulation and the system +being simulated. We highlight how the notion of a computational dynamical +system leads to questions at the intersection of computational complexity +theory, dynamical systems theory, and real algebraic geometry. + +
+
+ comment: 46+14 pages, 6 figures; accepted to FOCS 2024 +
+
+
+
+
+ + ☆ The repetition threshold for ternary rich words + + +
+ In 2014, Vesti proposed the problem of determining the repetition threshold +for infinite rich words, i.e., for infinite words in which all factors of +length $n$ contain $n$ distinct nonempty palindromic factors. In 2020, Currie, +Mol, and Rampersad proved a conjecture of Baranwal and Shallit that the +repetition threshold for binary rich words is $2 + \sqrt{2}/2$. In this paper, +we prove a structure theorem for $16/7$-power-free ternary rich words. Using +the structure theorem, we deduce that the repetition threshold for ternary rich +words is $1 + 1/(3 - \mu) \approx 2.25876324$, where $\mu$ is the unique real +root of the polynomial $x^3 - 2x^2 - 1$. + +
+
+ comment: 60 pages +
+
+
+
+
+ + ☆ Biological arrow of time: Emergence of tangled information hierarchies + and self-modelling dynamics + + +
+ We study open-ended evolution by focusing on computational and +information-processing dynamics underlying major evolutionary transitions. In +doing so, we consider biological organisms as hierarchical dynamical systems +that generate regularities in their phase-spaces through interactions with +their environment. These emergent information patterns can then be encoded +within the organism's components, leading to self-modelling "tangled +hierarchies". Our main conjecture is that when macro-scale patterns are encoded +within micro-scale components, it creates fundamental tensions (computational +inconsistencies) between what is encodable at a particular evolutionary stage +and what is potentially realisable in the environment. A resolution of these +tensions triggers an evolutionary transition which expands the problem-space, +at the cost of generating new tensions in the expanded space, in a continual +process. We argue that biological complexification can be interpreted +computation-theoretically, within the G\"odel--Turing--Post recursion-theoretic +framework, as open-ended generation of computational novelty. In general, this +process can be viewed as a meta-simulation performed by higher-order systems +that successively simulate the computation carried out by lower-order systems. +This computation-theoretic argument provides a basis for hypothesising the +biological arrow of time. + +
+
+ comment: 30 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Reactive graphs in action (extended version) + + +
+ Reactive graphs are transition structures where edges become active and
+inactive during their evolution; they were introduced by Dov Gabbay from a
+mathematical perspective. This paper presents Marge
+(https://fm-dcc.github.io/MARGe), a web-based tool to visualise and analyse
+reactive graphs enriched with labels. Marge animates the operational semantics
+of reactive graphs and offers different graphical views to provide insights
+over concrete systems. We motivate the applicability of reactive graphs for
+adaptive systems and for featured transition systems, using Marge to tighten
+the gap between the existing theoretical models and their usage to analyse
+concrete systems.
+
+
+ comment: Companion paper of an article accepted at FACS 2024 with a similar + name +
+
+
+
+
+ + ♻ ☆ Specify What? Enhancing Neural Specification Synthesis by Symbolic + Methods + + +
+ We investigate how combinations of Large Language Models (LLMs) and symbolic
+analyses can be used to synthesise specifications of C programs. The LLM
+prompts are augmented with outputs from two formal methods tools in the Frama-C
+ecosystem, Pathcrawler and EVA, to produce C program annotations in the
+specification language ACSL. We demonstrate how the addition of symbolic
+analysis to the workflow impacts the quality of annotations: information about
+input/output examples from Pathcrawler produce more context-aware annotations,
+while the inclusion of EVA reports yields annotations more attuned to runtime
+errors. In addition, we show that the method infers the program's intent
+rather than its behaviour, by generating specifications for buggy programs and
+observing robustness of the result against bugs.
+
+
+
+
+
+ + ♻ ☆ Synthesis of Computable Regular Functions of Infinite Words + + +
+ Regular functions from infinite words to infinite words can be equivalently +specified by MSO-transducers, streaming $\omega$-string transducers as well as +deterministic two-way transducers with look-ahead. In their one-way +restriction, the latter transducers define the class of rational functions. +Even though regular functions are robustly characterised by several +finite-state devices, even the subclass of rational functions may contain +functions which are not computable (by a Turing machine with infinite input). +This paper proposes a decision procedure for the following synthesis problem: +given a regular function $f$ (equivalently specified by one of the +aforementioned transducer model), is $f$ computable and if it is, synthesize a +Turing machine computing it. + For regular functions, we show that computability is equivalent to +continuity, and therefore the problem boils down to deciding continuity. We +establish a generic characterisation of continuity for functions preserving +regular languages under inverse image (such as regular functions). We exploit +this characterisation to show the decidability of continuity (and hence +computability) of rational and regular functions. For rational functions, we +show that this can be done in $\mathsf{NLogSpace}$ (it was already known to be +in $\mathsf{PTime}$ by Prieur). In a similar fashion, we also effectively +characterise uniform continuity of regular functions, and relate it to the +notion of uniform computability, which offers stronger efficiency guarantees. + +
+
+
+
+
+
+
+
+ Hardware Architecture 2

+
+
+
+ + ☆ Hardware-Friendly Implementation of Physical Reservoir Computing with + CMOS-based Time-domain Analog Spiking Neurons + + +
+ This paper introduces an analog spiking neuron that utilizes time-domain +information, i.e., a time interval of two signal transitions and a pulse width, +to construct a spiking neural network (SNN) for a hardware-friendly physical +reservoir computing (RC) on a complementary metal-oxide-semiconductor (CMOS) +platform. A neuron with leaky integrate-and-fire is realized by employing two +voltage-controlled oscillators (VCOs) with opposite sensitivities to the +internal control voltage, and the neuron connection structure is restricted by +the use of only 4 neighboring neurons on the 2-dimensional plane to feasibly +construct a regular network topology. Such a system enables us to compose an +SNN with a counter-based readout circuit, which simplifies the hardware +implementation of the SNN. Moreover, another technical advantage thanks to the +bottom-up integration is the capability of dynamically capturing every neuron +state in the network, which can significantly contribute to finding guidelines +on how to enhance the performance for various computational tasks in temporal +information processing. Diverse nonlinear physical dynamics needed for RC can +be realized by collective behavior through dynamic interaction between neurons, +like coupled oscillators, despite the simple network structure. With behavioral +system-level simulations, we demonstrate physical RC through short-term memory +and exclusive OR tasks, and the spoken digit recognition task with an accuracy +of 97.7% as well. Our system is considerably feasible for practical +applications and also can be a useful platform for studying the mechanism of +physical RC. + +
+
+
+
+
+ + ♻ ☆ WAGONN: Weight Bit Agglomeration in Crossbar Arrays for Reduced Impact + of Interconnect Resistance on DNN Inference Accuracy + + +
+ Deep neural network (DNN) accelerators employing crossbar arrays capable of
+in-memory computing (IMC) are highly promising for neural computing platforms.
+However, in deeply scaled technologies, interconnect resistance severely
+impairs IMC robustness, leading to a drop in the system accuracy. To address
+this problem, we propose SWANN - a technique based on shuffling weights in
+crossbar arrays which alleviates the detrimental effect of wire resistance on
+IMC. For 8T-SRAM-based 128x128 crossbar arrays in 7nm technology, SWANN
+enhances the accuracy from 47.78% to 83.5% for ResNet-20/CIFAR-10. We also show
+that SWANN can be used synergistically with Partial-Word-Line Activation,
+further boosting the accuracy. Moreover, we evaluate the implications of SWANN
+for compact ferroelectric-transistor-based crossbar arrays. SWANN incurs minimal
+hardware overhead, with less than a 1% increase in energy consumption.
+Additionally, the latency and area overheads of SWANN are ~1% and ~16%,
+respectively, when 1 ADC is utilized per crossbar array.
+
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 4 + +
+
+
+ + ☆ Massively parallel CMA-ES with increasing population + + +
+ The Increasing Population Covariance Matrix Adaptation Evolution Strategy +(IPOP-CMA-ES) algorithm is a reference stochastic optimizer dedicated to +blackbox optimization, where no prior knowledge about the underlying problem +structure is available. This paper aims at accelerating IPOP-CMA-ES thanks to +high performance computing and parallelism when solving large optimization +problems. We first show how BLAS and LAPACK routines can be introduced in +linear algebra operations, and we then propose two strategies for deploying +IPOP-CMA-ES efficiently on large-scale parallel architectures with thousands of +CPU cores. The first parallel strategy processes the multiple searches in the +same ordering as the sequential IPOP-CMA-ES, while the second one processes +concurrently these multiple searches. These strategies are implemented in +MPI+OpenMP and compared on 6144 cores of the supercomputer Fugaku. We manage to +obtain substantial speedups (up to several thousand) and even super-linear +ones, and we provide an in-depth analysis of our results to understand +precisely the superior performance of our second strategy. + +
+
+
+
+
+ + ☆ Blockchain-Enabled IoV: Secure Communication and Trustworthy + Decision-Making + + +
+ The Internet of Vehicles (IoV), which enables interactions between vehicles, +infrastructure, and the environment, faces challenges in maintaining +communication security and reliable automated decisions. This paper introduces +a decentralized framework comprising a primary layer for managing inter-vehicle +communication and a sub-layer for securing intra-vehicle interactions. By +implementing blockchain-based protocols like Blockchain-integrated Secure +Authentication (BiSA) and Decentralized Blockchain Name Resolution (DBNR), the +framework ensures secure, decentralized identity management and reliable data +exchanges, thereby supporting safe and efficient autonomous vehicle operations. + +
+
+ comment: The 2024 7th IEEE Conference on Dependable and Secure Computing +
+
+
+
+
+ + ♻ ☆ Mask-Encoded Sparsification: Mitigating Biased Gradients in + Communication-Efficient Split Learning + + +
+ This paper introduces a novel framework designed to achieve a high +compression ratio in Split Learning (SL) scenarios where resource-constrained +devices are involved in large-scale model training. Our investigations +demonstrate that compressing feature maps within SL leads to biased gradients +that can negatively impact the convergence rates and diminish the +generalization capabilities of the resulting models. Our theoretical analysis +provides insights into how compression errors critically hinder SL performance, +which previous methodologies underestimate. To address these challenges, we +employ a narrow bit-width encoded mask to compensate for the sparsification +error without increasing the order of time complexity. Supported by rigorous +theoretical analysis, our framework significantly reduces compression errors +and accelerates the convergence. Extensive experiments also verify that our +method outperforms existing solutions regarding training efficiency and +communication complexity. + +
+
+
+
+
+ + ♻ ☆ CompressedMediQ: Hybrid Quantum Machine Learning Pipeline for + High-Dimensional Neuroimaging Data + + +
+ This paper introduces CompressedMediQ, a novel hybrid quantum-classical +machine learning pipeline specifically developed to address the computational +challenges associated with high-dimensional multi-class neuroimaging data +analysis. Standard neuroimaging datasets, such as 4D MRI data from the +Alzheimer's Disease Neuroimaging Initiative (ADNI) and Neuroimaging in +Frontotemporal Dementia (NIFD), present significant hurdles due to their vast +size and complexity. CompressedMediQ integrates classical high-performance +computing (HPC) nodes for advanced MRI pre-processing and Convolutional Neural +Network (CNN)-PCA-based feature extraction and reduction, addressing the +limited-qubit availability for quantum data encoding in the NISQ (Noisy +Intermediate-Scale Quantum) era. This is followed by Quantum Support Vector +Machine (QSVM) classification. By utilizing quantum kernel methods, the +pipeline optimizes feature mapping and classification, enhancing data +separability and outperforming traditional neuroimaging analysis techniques. +Experimental results highlight the pipeline's superior accuracy in dementia +staging, validating the practical use of quantum machine learning in clinical +diagnostics. Despite the limitations of NISQ devices, this proof-of-concept +demonstrates the transformative potential of quantum-enhanced learning, paving +the way for scalable and precise diagnostic tools in healthcare and signal +processing. + +
+
+
+
+
+
+
+
+ + Programming and Languages 2 + +
+
+
+ + ☆ Memory Consistency and Program Transformations + + +
+ A memory consistency model specifies the allowed behaviors of shared memory +concurrent programs. At the language level, these models are known to have a +non-trivial impact on the safety of program optimizations, limiting the ability +to rearrange/refactor code without introducing new behaviors. Existing +programming language memory models try to address this by permitting more +(relaxed/weak) concurrent behaviors but are still unable to allow all the +desired optimizations. A core problem is that weaker consistency models may +also render optimizations unsafe, a conclusion that goes against the intuition +of them allowing more behaviors. This exposes an open problem of the +compositional interaction between memory consistency semantics and +optimizations: which parts of the semantics correspond to allowing/disallowing +which set of optimizations is unclear. In this work, we establish a formal +foundation suitable enough to understand this compositional nature, decomposing +optimizations into a finite set of elementary effects on program execution +traces, over which aspects of safety can be assessed. We use this decomposition +to identify a desirable compositional property (complete) that would guarantee +the safety of optimizations from one memory model to another. We showcase its +practicality by proving such a property between Sequential Consistency (SC) and +$SC_{RR}$, the latter allowing independent read-read reordering over $SC$. Our +work potentially paves way to a new design methodology of programming-language +memory models, one that places emphasis on the optimizations desired to be +performed. + +
+
+
+
+
+ + ♻ ☆ Reactive graphs in action (extended version) + + +
+ Reactive graphs are transition structures where edges become active and
+inactive during their evolution; they were introduced by Dov Gabbay from a
+mathematical perspective. This paper presents Marge
+(https://fm-dcc.github.io/MARGe), a web-based tool to visualise and analyse
+reactive graphs enriched with labels. Marge animates the operational semantics
+of reactive graphs and offers different graphical views to provide insights
+over concrete systems. We motivate the applicability of reactive graphs for
+adaptive systems and for featured transition systems, using Marge to tighten
+the gap between the existing theoretical models and their usage to analyse
+concrete systems.
+
+
+ comment: Companion paper of an article accepted at FACS 2024 with a similar + name +
+
+
+
+
+
+
+
+ + Performance Profiling 1 + +
+
+
+ + ☆ HRA: A Multi-Criteria Framework for Ranking Metaheuristic Optimization + Algorithms + + +
+ Metaheuristic algorithms are essential for solving complex optimization +problems in different fields. However, the difficulty in comparing and rating +these algorithms remains due to the wide range of performance metrics and +problem dimensions usually involved. On the other hand, nonparametric +statistical methods and post hoc tests are time-consuming, especially when we +only need to identify the top performers among many algorithms. The +Hierarchical Rank Aggregation (HRA) algorithm aims to efficiently rank +metaheuristic algorithms based on their performance across many criteria and +dimensions. The HRA employs a hierarchical framework that begins with +collecting performance metrics on various benchmark functions and dimensions. +Rank-based normalization is employed for each performance measure to ensure +comparability and the robust TOPSIS aggregation is applied to combine these +rankings at several hierarchical levels, resulting in a comprehensive ranking +of the algorithms. Our study uses data from the CEC 2017 competition to +demonstrate the robustness and efficacy of the HRA framework. It examines 30 +benchmark functions and evaluates the performance of 13 metaheuristic +algorithms across five performance indicators in four distinct dimensions. This +presentation highlights the potential of the HRA to enhance the interpretation +of the comparative advantages and disadvantages of various algorithms by +simplifying practitioners' choices of the most appropriate algorithm for +certain optimization problems. + +
+
+ comment: 13 pages, 1 figure +
+
+
+
+
+
+
+
+ + Logic in Computer Science 5 + +
+
+
+ + ☆ Biological arrow of time: Emergence of tangled information hierarchies + and self-modelling dynamics + + +
+ We study open-ended evolution by focusing on computational and +information-processing dynamics underlying major evolutionary transitions. In +doing so, we consider biological organisms as hierarchical dynamical systems +that generate regularities in their phase-spaces through interactions with +their environment. These emergent information patterns can then be encoded +within the organism's components, leading to self-modelling "tangled +hierarchies". Our main conjecture is that when macro-scale patterns are encoded +within micro-scale components, it creates fundamental tensions (computational +inconsistencies) between what is encodable at a particular evolutionary stage +and what is potentially realisable in the environment. A resolution of these +tensions triggers an evolutionary transition which expands the problem-space, +at the cost of generating new tensions in the expanded space, in a continual +process. We argue that biological complexification can be interpreted +computation-theoretically, within the G\"odel--Turing--Post recursion-theoretic +framework, as open-ended generation of computational novelty. In general, this +process can be viewed as a meta-simulation performed by higher-order systems +that successively simulate the computation carried out by lower-order systems. +This computation-theoretic argument provides a basis for hypothesising the +biological arrow of time. + +
+
+ comment: 30 pages, 13 figures +
+
+
+
+
+ + ☆ On Randomized Computational Models and Complexity Classes: a Historical + Overview + + +
+ Since their appearance in the 1950s, computational models capable of +performing probabilistic choices have received wide attention and are nowadays +pervasive in almost every areas of computer science. Their development was also +inextricably linked with inquiries about computation power and resource issues. +Although most crucial notions in the field are well-known, the related +terminology is sometimes imprecise or misleading. The present work aims to +clarify the core features and main differences between machines and classes +developed in relation to randomized computation. To do so, we compare the +modern definitions with original ones, recalling the context in which they +first appeared, and investigate the relations linking probabilistic and +counting models. + +
+
+
+
+
+ + ☆ An Imperative Language for Verified Exact Real-Number Computation + + +
+ We introduce Clerical, a programming language for exact real-number +computation that combines first-order imperative-style programming with a limit +operator for computation of real numbers as limits of Cauchy sequences. We +address the semidecidability of the linear ordering of the reals by +incorporating nondeterministic guarded choice, through which decisions based on +partial comparison operations on reals can be patched together to give total +programs. The interplay between mutable state, nondeterminism, and computation +of limits is controlled by the requirement that expressions computing limits +and guards modify only local state. We devise a domain-theoretic denotational +semantics that uses a variant of Plotkin powerdomain construction tailored to +our specific version of nondeterminism. We formulate a Hoare-style +specification logic, show that it is sound for the denotational semantics, and +illustrate the setup by implementing and proving correct a program for +computation of $\pi$ as the least positive zero of $\sin$. The modular +character of Clerical allows us to compose the program from smaller parts, each +of which is shown to be correct on its own. We provide a proof-of-concept OCaml +implementation of Clerical, and formally verify parts of the development, +notably the soundness of specification logic, in the Coq proof assistant. + +
+
+
+
+
+ + ♻ ☆ Interpretable classifiers for tabular data via discretization and + feature selection + + +
+ We introduce a method for computing immediately human interpretable yet +accurate classifiers from tabular data. The classifiers obtained are short +Boolean formulas, computed via first discretizing the original data and then +using feature selection coupled with a very fast algorithm for producing the +best possible Boolean classifier for the setting. We demonstrate the approach +via 12 experiments, obtaining results with accuracies comparable to ones +obtained via random forests, XGBoost, and existing results for the same +datasets in the literature. In most cases, the accuracy of our method is in +fact similar to that of the reference methods, even though the main objective +of our study is the immediate interpretability of our classifiers. We also +prove a new result on the probability that the classifier we obtain from +real-life data corresponds to the ideally best classifier with respect to the +background distribution the data comes from. + +
+
+ comment: Preprint of a paper in DAO-XAI 2024 (Data meets Applied Ontologies in + Explainable AI) +
+
+
+
+
+ + ♻ ☆ Synthesis of Computable Regular Functions of Infinite Words + + +
+ Regular functions from infinite words to infinite words can be equivalently +specified by MSO-transducers, streaming $\omega$-string transducers as well as +deterministic two-way transducers with look-ahead. In their one-way +restriction, the latter transducers define the class of rational functions. +Even though regular functions are robustly characterised by several +finite-state devices, even the subclass of rational functions may contain +functions which are not computable (by a Turing machine with infinite input). +This paper proposes a decision procedure for the following synthesis problem: +given a regular function $f$ (equivalently specified by one of the +aforementioned transducer model), is $f$ computable and if it is, synthesize a +Turing machine computing it. + For regular functions, we show that computability is equivalent to +continuity, and therefore the problem boils down to deciding continuity. We +establish a generic characterisation of continuity for functions preserving +regular languages under inverse image (such as regular functions). We exploit +this characterisation to show the decidability of continuity (and hence +computability) of rational and regular functions. For rational functions, we +show that this can be done in $\mathsf{NLogSpace}$ (it was already known to be +in $\mathsf{PTime}$ by Prieur). In a similar fashion, we also effectively +characterise uniform continuity of regular functions, and relate it to the +notion of uniform computability, which offers stronger efficiency guarantees. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Distributed, Parallel, and Cluster Computing 14 + +
+
+
+ + ☆ Temporal Load Imbalance on Ondes3D Seismic Simulator for Different + Multicore Architectures + + +
+ The variety of today's multicore architectures motivates researchers to +explore parallel scientific applications on different platforms. Load imbalance +is one performance issue that can prejudice parallel applications from +exploiting the computational power of these platforms. Ondes3D is a scientific +application for seismic wave simulation used to assess the geological impact of +earthquakes. Its parallelism relies on applying a regular domain decomposition +in the geological domain provided and distributing each sub-domain to MPI +ranks. Previous works investigate the significant spatial and temporal +imbalance in Ondes3D and suggest new parallelization and load balancing +techniques to minimize them. However, none explored its execution on different +architectures. Our paper evaluates the performance of Ondes3D for two +earthquake scenarios on eight different multicore architectures, including +Intel, AMD, and ARM processors. We measure the load distribution per MPI rank, +evaluate the temporal load imbalance, and compare the execution of the +application's kernels. Our results show that the temporal load imbalance in +Ondes3D depends on the architecture chosen, with some platforms minimizing such +imbalance more effectively. + +
+
+ comment: The 2020 International Conference on High Performance Computing and + Simulation (HPCS 2020) +
+
+
+
+
+ + ☆ Communication Lower Bounds and Optimal Algorithms for Symmetric Matrix + Computations + + +
+ In this article, we focus on the communication costs of three symmetric +matrix computations: i) multiplying a matrix with its transpose, known as a +symmetric rank-k update (SYRK) ii) adding the result of the multiplication of a +matrix with the transpose of another matrix and the transpose of that result, +known as a symmetric rank-2k update (SYR2K) iii) performing matrix +multiplication with a symmetric input matrix (SYMM). All three computations +appear in the Level 3 Basic Linear Algebra Subroutines (BLAS) and have wide use +in applications involving symmetric matrices. We establish communication lower +bounds for these kernels using sequential and distributed-memory parallel +computational models, and we show that our bounds are tight by presenting +communication-optimal algorithms for each setting. Our lower bound proofs rely +on applying a geometric inequality for symmetric computations and analytically +solving constrained nonlinear optimization problems. The symmetric matrix and +its corresponding computations are accessed and performed according to a +triangular block partitioning scheme in the optimal algorithms. + +
+
+ comment: 43 pages, 6 figures. To be published in ACM Transactions on Parallel + Computing +
+
+
+
+
+ + ☆ Federated Learning with Integrated Sensing, Communication, and + Computation: Frameworks and Performance Analysis + + +
+ With the emergence of integrated sensing, communication, and computation +(ISCC) in the upcoming 6G era, federated learning with ISCC (FL-ISCC), +integrating sample collection, local training, and parameter exchange and +aggregation, has garnered increasing interest for enhancing training +efficiency. Currently, FL-ISCC primarily includes two algorithms: FedAVG-ISCC +and FedSGD-ISCC. However, the theoretical understanding of the performance and +advantages of these algorithms remains limited. To address this gap, we +investigate a general FL-ISCC framework, implementing both FedAVG-ISCC and +FedSGD-ISCC. We experimentally demonstrate the substantial potential of the +ISCC framework in reducing latency and energy consumption in FL. Furthermore, +we provide a theoretical analysis and comparison. The results reveal that:1) +Both sample collection and communication errors negatively impact algorithm +performance, highlighting the need for careful design to optimize FL-ISCC +applications. 2) FedAVG-ISCC performs better than FedSGD-ISCC under IID data +due to its advantage with multiple local updates. 3) FedSGD-ISCC is more robust +than FedAVG-ISCC under non-IID data, where the multiple local updates in +FedAVG-ISCC worsen performance as non-IID data increases. FedSGD-ISCC maintains +performance levels similar to IID conditions. 4) FedSGD-ISCC is more resilient +to communication errors than FedAVG-ISCC, which suffers from significant +performance degradation as communication errors increase.Extensive simulations +confirm the effectiveness of the FL-ISCC framework and validate our theoretical +analysis. + +
+
+ comment: due to the limitation "The abstract field cannot be longer than 1,920
+ characters", the abstract appearing here is slightly shorter than that in the
+ PDF file
+
+
+
+
+
+ + ☆ Energy Efficiency Support for Software Defined Networks: a Serverless + Computing Approach + + +
+ Automatic network management strategies have become paramount for meeting the +needs of innovative real-time and data-intensive applications, such as in the +Internet of Things. However, meeting the ever-growing and fluctuating demands +for data and services in such applications requires more than ever an efficient +and scalable network resource management approach. Such approach should enable +the automated provisioning of services while incentivising energy-efficient +resource usage that expands throughout the edge-to-cloud continuum. This paper +is the first to realise the concept of modular Software-Defined Networks based +on serverless functions in an energy-aware environment. By adopting Function as +a Service, the approach enables on-demand deployment of network functions, +resulting in cost reduction through fine resource provisioning granularity. An +analytical model is presented to approximate the service delivery time and +power consumption, as well as an open-source prototype implementation supported +by an extensive experimental evaluation. The experiments demonstrate not only +the practical applicability of the proposed approach but significant +improvement in terms of energy efficiency. + +
+
+
+
+
+ + ☆ A Reinforcement Learning Environment for Automatic Code Optimization in + the MLIR Compiler + + +
+ Code optimization is a crucial task aimed at enhancing code performance. +However, this process is often tedious and complex, highlighting the necessity +for automatic code optimization techniques. Reinforcement Learning (RL), a +machine learning technique, has emerged as a promising approach for tackling +such complex optimization problems. In this project, we introduce the first RL +environment for the MLIR compiler, dedicated to facilitating MLIR compiler +research, and enabling automatic code optimization using Multi-Action +Reinforcement Learning. We also propose a novel formulation of the action space +as a Cartesian product of simpler action subspaces, enabling more efficient and +effective optimizations. Experimental results demonstrate that our proposed +environment allows for an effective optimization of MLIR operations, and yields +comparable performance to TensorFlow, surpassing it in multiple cases, +highlighting the potential of RL-based optimization in compiler frameworks. + +
+
+
+
+
+ + ☆ Delay Analysis of EIP-4844 + + +
+ Proto-Danksharding, proposed in Ethereum Improvement Proposal 4844
+(EIP-4844), aims to incrementally improve the scalability of the Ethereum
+blockchain by introducing a new type of transaction known as blob-carrying
+transactions. These transactions incorporate binary large objects (blobs) of
+data that are stored off-chain but referenced and verified on-chain to ensure
+data availability. By decoupling data availability from transaction execution,
+Proto-Danksharding alleviates network congestion and reduces gas fees, laying
+the groundwork for future, more advanced sharding solutions. This letter
+provides an analytical model to derive the delay for these new transactions. We
+model the system as an $\mathrm{M/D}^B/1$ queue, whose steady-state
+distribution we then find by embedding a Markov chain and using the
+supplementary variable method. We show that transactions carrying more blobs
+but arriving less frequently impose higher delays on the system than those
+carrying fewer blobs but arriving more frequently.
+
+
+
+
+
+ + ☆ Ladon: High-Performance Multi-BFT Consensus via Dynamic Global Ordering + (Extended Version) + + +
+ Multi-BFT consensus runs multiple leader-based consensus instances in +parallel, circumventing the leader bottleneck of a single instance. However, it +contains an Achilles' heel: the need to globally order output blocks across +instances. Deriving this global ordering is challenging because it must cope +with different rates at which blocks are produced by instances. Prior Multi-BFT +designs assign each block a global index before creation, leading to poor +performance. + We propose Ladon, a high-performance Multi-BFT protocol that allows varying +instance block rates. Our key idea is to order blocks across instances +dynamically, which eliminates blocking on slow instances. We achieve dynamic +global ordering by assigning monotonic ranks to blocks. We pipeline rank +coordination with the consensus process to reduce protocol overhead and combine +aggregate signatures with rank information to reduce message complexity. +Ladon's dynamic ordering enables blocks to be globally ordered according to +their generation, which respects inter-block causality. We implemented and +evaluated Ladon by integrating it with both PBFT and HotStuff protocols. Our +evaluation shows that Ladon-PBFT (resp., Ladon-HotStuff) improves the peak +throughput of the prior art by $\approx$8x (resp., 2x) and reduces latency by +$\approx$62% (resp., 23%), when deployed with one straggling replica (out of +128 replicas) in a WAN setting. + +
+
+
+
+
+ + ☆ Skip TLB flushes for reused pages within mmap's + + +
+ Memory access efficiency is significantly enhanced by caching recent address
+translations in the CPUs' Translation Lookaside Buffers (TLBs). However, since
+the operating system is not aware of which core is using a particular mapping,
+it flushes TLB entries across all cores where the application runs whenever
+addresses are unmapped, ensuring security and consistency. These TLB flushes,
+known as TLB shootdowns, are costly and create a performance and scalability
+bottleneck. A key contributor to TLB shootdowns is memory-mapped I/O,
+particularly during mmap-munmap cycles and page cache evictions. Often, the
+same physical pages are reassigned to the same process post-eviction,
+presenting an opportunity for the operating system to reduce the frequency of
+TLB shootdowns. We demonstrate that by slightly extending the mmap function,
+TLB shootdowns for these "recycled pages" can be avoided.
+ Therefore we introduce and implement the "fast page recycling" (FPR) feature
+within the mmap system call. FPR-mmaps maintain security by only triggering TLB
+shootdowns when a page exits its recycling cycle and is allocated to a
+different process. To ensure consistency when FPR-mmap pointers are used, we
+made minor adjustments to virtual memory management to avoid the ABA problem.
+Unlike previous methods to mitigate shootdown effects, our approach does not
+require any hardware modifications and operates transparently within the
+existing Linux virtual memory framework.
+ Our evaluations across a variety of CPU, memory, and storage setups,
+including persistent memory and Optane SSDs, demonstrate that FPR delivers
+notable performance gains, with improvements of up to 28% in real-world
+applications and 92% in micro-benchmarks. Additionally, we show that TLB
+shootdowns are a significant source of bottlenecks, previously misattributed to
+other components of the Linux kernel.
+
+
+
+
+
+ + ☆ Dynamic DAG-Application Scheduling for Multi-Tier Edge Computing in + Heterogeneous Networks + + +
+ Edge computing is deemed a promising technique to execute latency-sensitive +applications by offloading computation-intensive tasks to edge servers. +Extensive research has been conducted in the field of end-device to edge server +task offloading for several goals, including latency minimization, energy +optimization, and resource optimization. However, few of them consider our +mobile computing devices (smartphones, tablets, and laptops) to be edge +devices. In this paper, we propose a novel multi-tier edge computing framework, +which we refer to as M-TEC, that aims to optimize latency, reduce the +probability of failure, and optimize cost while accounting for the sporadic +failure of personally owned devices and the changing network conditions. We +conduct experiments with a real testbed and a real commercial CBRS 4G network, +and the results indicate that M-TEC is capable of reducing the end-to-end +latency of applications by at least 8\% compared to the best baseline under a +variety of network conditions, while providing reliable performance at an +affordable cost. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ CountChain: A Decentralized Oracle Network for Counting Systems + + +
+ Blockchain integration in industries like online advertising is hindered by +its connectivity limitations to off-chain data. These industries heavily rely +on precise counting systems for collecting and analyzing off-chain data. This +requires mechanisms, often called oracles, to feed off-chain data into smart +contracts. However, current oracle solutions are ill-suited for counting +systems since the oracles do not know when to expect the data, posing a +significant challenge. + To address this, we present CountChain, a decentralized oracle network for +counting systems. In CountChain, data is received by all oracle nodes, and any +node can submit a proposition request. Each proposition contains enough data to +evaluate the occurrence of an event. Only randomly selected nodes participate +in a game to evaluate the truthfulness of each proposition by providing proof +and some stake. Finally, the propositions with the outcome of True increment +the counter in a smart contract. Thus, instead of a contract calling oracles +for data, in CountChain, the oracles call a smart contract when the data is +available. Furthermore, we present a formal analysis and experimental +evaluation of the system's parameters on over half a million data points to +obtain optimal system parameters. In such conditions, our game-theoretical +analysis demonstrates that a Nash equilibrium exists wherein all rational +parties participate with honesty. + +
+
+ comment: being published at https://ieee-cybermatics.org/2024/blockchain/ +
+
+
+
+
+ + ☆ Advances in APPFL: A Comprehensive and Extensible Federated Learning + Framework + + +
+ Federated learning (FL) is a distributed machine learning paradigm enabling
+collaborative model training while preserving data privacy. In today's
+landscape, where most data is proprietary, confidential, and distributed, FL
+has become a promising approach to leverage such data effectively, particularly
+in sensitive domains such as medicine and the electric grid. Heterogeneity and
+security are the key challenges in FL; however, most existing FL frameworks
+either fail to address these challenges adequately or lack the flexibility to
+incorporate new solutions. To this end, we present the recent advances in
+developing APPFL, an extensible framework and benchmarking suite for federated
+learning, which offers comprehensive solutions for heterogeneity and security
+concerns, as well as user-friendly interfaces for integrating new algorithms or
+adapting to new applications. We demonstrate the capabilities of APPFL through
+extensive experiments evaluating various aspects of FL, including communication
+efficiency, privacy preservation, computational performance, and resource
+utilization. We further highlight the extensibility of APPFL through case
+studies in vertical, hierarchical, and decentralized FL. APPFL is open-sourced
+at https://github.com/APPFL/APPFL.
+
+
+
+
+
+ + ♻ ☆ ACC Saturator: Automatic Kernel Optimization for Directive-Based GPU + Code + + +
+ Automatic code optimization is a complex process that typically involves the +application of multiple discrete algorithms that modify the program structure +irreversibly. However, the design of these algorithms is often monolithic, and +they require repetitive implementation to perform similar analyses due to the +lack of cooperation. To address this issue, modern optimization techniques, +such as equality saturation, allow for exhaustive term rewriting at various +levels of inputs, thereby simplifying compiler design. + In this paper, we propose equality saturation to optimize sequential codes +utilized in directive-based programming for GPUs. Our approach realizes less +computation, less memory access, and high memory throughput simultaneously. Our +fully-automated framework constructs single-assignment forms from inputs to be +entirely rewritten while keeping dependencies and extracts optimal cases. +Through practical benchmarks, we demonstrate a significant performance +improvement on several compilers. Furthermore, we highlight the advantages of +computational reordering and emphasize the significance of memory-access order +for modern GPUs. + +
+
+ comment: To appear in: Proceedings of Eleventh Workshop on Accelerator + Programming and Directives (WACCPD 2024) +
+
+
+
+
+ + ♻ ☆ Context Adaptive Cooperation + + +
+ As shown by Reliable Broadcast and Consensus, cooperation among a set of +independent computing entities (sequential processes) is a central issue in +distributed computing. Considering $n$-process asynchronous message-passing +systems where some processes can be Byzantine, this paper introduces a new +cooperation abstraction denoted Context-Adaptive Cooperation (CAC). While +Reliable Broadcast is a one-to-$n$ cooperation abstraction and Consensus is an +$n$-to-$n$ cooperation abstraction, CAC is a $d$-to-$n$ cooperation abstraction +where the parameter $d$ ($1\leq d\leq n$) depends on the run and remains +unknown to the processes. Moreover, the correct processes accept the same set +of $\ell$ pairs $\langle v,i\rangle$ ($v$ is the value proposed by $p_i$) from +the $d$ proposer processes, where $1 \leq \ell \leq d$ and, as $d$, $\ell$ +remains unknown to the processes (except in specific cases). Those $\ell$ +values are accepted one at a time in different orders at each process. +Furthermore, CAC provides the processes with an imperfect oracle that gives +information about the values that they may accept in the future. In a very +interesting way, the CAC abstraction is particularly efficient in favorable +circumstances. To illustrate its practical use, the paper describes in detail +two applications that benefit from the abstraction: a fast consensus +implementation under low contention (named Cascading Consensus), and a novel +naming problem. + +
+
+
+
+
+ + ♻ ☆ Navigating High-Degree Heterogeneity: Federated Learning in Aerial and + Space Networks + + +
+ Federated learning offers a compelling solution to the challenges of +networking and data privacy within aerial and space networks by utilizing vast +private edge data and computing capabilities accessible through drones, +balloons, and satellites. While current research has focused on optimizing the +learning process, computing efficiency, and minimizing communication overhead, +the heterogeneity issue and class imbalance remain a significant barrier to +rapid model convergence. In this paper, we explore the influence of +heterogeneity on class imbalance, which diminishes performance in Aerial and +Space Networks (ASNs)-based federated learning. We illustrate the correlation +between heterogeneity and class imbalance within grouped data and show how +constraints such as battery life exacerbate the class imbalance challenge. Our +findings indicate that ASNs-based FL faces heightened class imbalance issues +even with similar levels of heterogeneity compared to other scenarios. Finally, +we analyze the impact of varying degrees of heterogeneity on FL training and +evaluate the efficacy of current state-of-the-art algorithms under these +conditions. Our results reveal that the heterogeneity challenge is more +pronounced in ASNs-based federated learning and that prevailing algorithms +often fail to effectively address high levels of heterogeneity. + +
+
+ comment: Accepted by IEEE 10th World Forum on Internet of Things +
+
+
+
+
+
+
+
+ + Programming and Languages 10 + +
+
+
+ + ☆ The Incredible Shrinking Context... in a decompiler near you + + +
+ Decompilation of binary code has arisen as a highly-important application in
+the space of Ethereum VM (EVM) smart contracts. Major new decompilers appear
+nearly every year and attain popularity, for a multitude of reverse-engineering
+or tool-building purposes. Technically, the problem is fundamental: it consists
+of recovering high-level control flow from a highly-optimized
+continuation-passing-style (CPS) representation. Architecturally, decompilers
+can be built using either static analysis or symbolic execution techniques.
+ We present Shrnkr, a static-analysis-based decompiler succeeding the
+state-of-the-art Elipmoc decompiler. Shrnkr manages to achieve drastic
+improvements relative to the state of the art, in all significant dimensions:
+scalability, completeness, precision. Chief among the techniques employed is a
+new variant of static analysis context: shrinking context sensitivity.
+Shrinking context sensitivity performs deep cuts in the static analysis
+context, eagerly "forgetting" control-flow history, in order to leave room for
+further precise reasoning.
+ We compare Shrnkr to state-of-the-art decompilers, both static-analysis- and
+symbolic-execution-based. In a standard benchmark set, Shrnkr scales to over
+99.5% of contracts (compared to ~95%), covers (i.e., reaches and manages to
+decompile) 67% more code, and reduces key imprecision metrics by over 65%.
+
+
+
+
+
+ + ☆ Towards Quantum Multiparty Session Types + + +
+ Multiparty Session Types (MPSTs) offer a structured way of specifying +communication protocols and guarantee relevant communication properties, such +as deadlock-freedom. In this paper, we extend a minimal MPST system with +quantum data and operations, enabling the specification of quantum protocols. +Quantum MPSTs (QMPSTs) provide a formal notation to describe quantum protocols, +both at the abstract level of global types, describing which communications can +take place in the system and their dependencies, and at the concrete level of +local types and quantum processes, describing the expected behavior of each +participant in the protocol. Type-checking relates these two levels formally, +ensuring that processes behave as prescribed by the global type. Beyond usual +communication properties, QMPSTs also allow us to prove that qubits are owned +by a single process at any time, capturing the quantum no-cloning and +no-deleting theorems. We use our approach to verify four quantum protocols from +the literature, respectively Teleportation, Secret Sharing, Bit-Commitment, and +Key Distribution. + +
+
+ comment: To appear at SEFM 2024 +
+
+
+
+
+ + ☆ Scheme Pearl: Quantum Continuations + + +
+ We advance the thesis that the simulation of quantum circuits is +fundamentally about the efficient management of a large (potentially +exponential) number of delimited continuations. The family of Scheme languages, +with its efficient implementations of first-class continuations and with its +imperative constructs, provides an elegant host for modeling and simulating +quantum circuits. + +
+
+ comment: Appeared at Scheme Workshop 2022 +
+
+
+
+
+ + ☆ Introducing Quantification into a Hierarchical Graph Rewriting Language + + +
+ LMNtal is a programming and modeling language based on hierarchical graph
+rewriting that uses logical variables to represent connectivity and membranes
+to represent hierarchy. On the theoretical side, it allows logical
+interpretation based on intuitionistic linear logic; on the practical side, its
+full-fledged implementation supports a graph-based parallel model checker and
+has been used to model diverse applications including various computational
+models. This paper discusses how we extend LMNtal to QLMNtal (LMNtal with
+Quantification) to further enhance the usefulness of hierarchical graph
+rewriting for high-level modeling by introducing quantifiers into rewriting as
+well as matching. Those quantifiers allow us to express universal
+quantification, cardinality and non-existence in an integrated manner. Unlike
+other attempts to introduce quantifiers into graph rewriting, QLMNtal has
+term-based syntax, whose semantics is smoothly integrated into the small-step
+semantics of the base language LMNtal. The proposed constructs allow combined
+and nested use of quantifiers within individual rewrite rules.
+
+
+ comment: Extended version (with Appendix) of the paper presented at the 34th + International Symposium on Logic-Based Program Synthesis and Transformation + (LOPSTR 2024), Milano, Italy, September 2024, LNCS 14919, Springer-Verlag, + pp.220-239. 26 pages +
+
+
+
+
+ + ☆ No Saved Kaleidosope: an 100% Jitted Neural Network Coding Language with + Pythonic Syntax + + +
+ We developed a jitted compiler for training Artificial Neural Networks using
+C++, LLVM and Cuda. It features object-oriented characteristics, strong typing,
+parallel workers for data pre-processing, pythonic syntax for expressions,
+PyTorch like model declaration and Automatic Differentiation. We implement the
+mechanisms of cache and pooling in order to manage VRAM, cuBLAS for high
+performance matrix multiplication and cuDNN for convolutional layers. In our
+experiments with Residual Convolutional Neural Networks on ImageNet, we reach
+similar speed but degraded performance. Also, the GRU network experiments show
+similar accuracy, but our compiler has degraded speed in that task. However,
+our compiler demonstrates promising results at the CIFAR-10 benchmark, in which
+we reach the same performance and about the same speed as PyTorch. We make the
+code publicly available at: https://github.com/NoSavedDATA/NoSavedKaleidoscope
+
+
+ comment: 12 pages, 3 figures and 3 tables +
+
+
+
+
+ + ☆ Minuska: Towards a Formally Verified Programming Language Framework + + +
+ Programming language frameworks allow us to generate language tools (e.g., +interpreters) just from a formal description of the syntax and semantics of a +programming language. As these frameworks tend to be quite complex, an issue +arises whether we can trust the generated tools. To address this issue, we +introduce a practical formal programming language framework called Minuska, +which always generates a provably correct interpreter given a valid language +definition. This is achieved by (1) defining a language MinusLang for +expressing programming language definitions and giving it formal semantics and +(2) using the Coq proof assistant to implement an interpreter parametric in a +MinusLang definition and to prove it correct. Minuska provides strong +correctness guarantees and can support nontrivial languages while performing +well. This is the extended version of the SEFM24 paper of the same name. + +
+
+
+
+
+ + ♻ ☆ Strengthening Solidity Invariant Generation: From Post- to + Pre-Deployment + + +
+ Invariants are essential for ensuring the security and correctness of +Solidity smart contracts, particularly in the context of blockchain's +immutability and decentralized execution. This paper introduces InvSol, a novel +framework for pre-deployment invariant generation tailored specifically for +Solidity smart contracts. Unlike existing solutions, namely InvCon, InvCon+, +and Trace2Inv, that rely on post-deployment transaction histories on Ethereum +mainnet, InvSol identifies invariants before deployment and offers +comprehensive coverage of Solidity language constructs, including loops. +Additionally, InvSol incorporates custom templates to effectively prevent +critical issues such as reentrancy, out-of-gas errors, and exceptions during +invariant generation. We rigorously evaluate InvSol using a benchmark set of +smart contracts and compare its performance with state-of-the-art solutions. +Our findings reveal that InvSol significantly outperforms these tools, +demonstrating its effectiveness in handling new contracts with limited +transaction histories. Notably, InvSol achieves a 15% improvement in +identifying common vulnerabilities compared to InvCon+ and is able to address +certain crucial vulnerabilities using specific invariant templates, better than +Trace2Inv. + +
+
+
+
+
+ + ♻ ☆ Disentangling Parallelism and Interference in Game Semantics + + +
+ Game semantics is a denotational semantics presenting compositionally the
+computational behaviour of various kinds of effectful programs. One of its
+celebrated achievements is to have obtained full abstraction results for
+programming languages with a variety of computational effects, in a single
+framework. This is known as the semantic cube or Abramsky's cube, which for
+sequential deterministic programs establishes a correspondence between certain
+conditions on strategies (''innocence'', ''well-bracketing'', ''visibility'')
+and the absence of matching computational effects. Outside of the sequential
+deterministic realm, there is still a wealth of game semantics-based full
+abstraction results; but they no longer fit in a unified canvas. In particular,
+Ghica and Murawski's fully abstract model for shared state concurrency (IA)
+does not have a matching notion of pure parallel program-we say that
+parallelism and interference (i.e. state plus semaphores) are entangled. In
+this paper we construct a causal version of Ghica and Murawski's model, also
+fully abstract for IA. We provide compositional conditions parallel innocence
+and sequentiality, respectively banning interference and parallelism, and
+leading to four full abstraction results. To our knowledge, this is the first
+extension of Abramsky's semantic cube programme beyond the sequential
+deterministic world.
+
+
+
+
+
+ + ♻ ☆ Validating Traces of Distributed Programs Against TLA+ Specifications + + +
+ TLA+ is a formal language for specifying systems, including distributed +algorithms, that is supported by powerful verification tools. In this work we +present a framework for relating traces of distributed programs to high-level +specifications written in TLA+. The problem is reduced to a constrained model +checking problem, realized using the TLC model checker. Our framework consists +of an API for instrumenting Java programs in order to record traces of +executions, of a collection of TLA+ operators that are used for relating those +traces to specifications, and of scripts for running the model checker. +Crucially, traces only contain updates to specification variables rather than +full values, and developers may choose to trace only certain variables. We have +applied our approach to several distributed programs, detecting discrepancies +between the specifications and the implementations in all cases. We discuss +reasons for these discrepancies, best practices for instrumenting programs, and +how to interpret the verdict produced by TLC. + +
+
+
+
+
+ + ♻ ☆ MonoCoder: Domain-Specific Code Language Model for HPC Codes and Tasks + + +
+ With easier access to powerful compute resources, there is a growing trend in +AI for software development to develop large language models (LLMs) to address +a variety of programming tasks. Even LLMs applied to tasks from the +high-performance computing (HPC) domain are huge in size and demand expensive +compute resources for training. This is partly because LLMs for HPC tasks are +obtained by finetuning existing LLMs that support several natural and/or +programming languages. We found this design choice confusing - why do we need +LLMs trained on natural languages and programming languages unrelated to HPC +for HPC-specific tasks? In this line of work, we aim to question choices made +by existing LLMs by developing smaller language models (LMs) for specific +domains - we call them domain-specific LMs. Specifically, we start with HPC as +a domain and build an HPC-specific LM, named MonoCoder, which is orders of +magnitude smaller than existing LMs but delivers better performance on non-HPC +and HPC codes. Specifically, we pre-trained MonoCoder on an HPC-specific +dataset (named HPCorpus) of C and C++ programs mined from GitHub. We evaluated +the performance of MonoCoder against state-of-the-art multi-lingual LLMs. +Results demonstrate that MonoCoder, although much smaller than existing LMs, +outperforms other LLMs on normalized-perplexity tests (in relation to model +size) while also delivering competing CodeBLEU scores for high-performance and +parallel code generations. In other words, results suggest that MonoCoder +understands HPC code better than state-of-the-art LLMs. + +
+
+
+
+
+
+
+
+ + Logic in Computer Science 5 + +
+
+
+ + ☆ Computation and Complexity of Preference Inference Based on Hierarchical + Models + + +
+ Preference Inference involves inferring additional user preferences from +elicited or observed preferences, based on assumptions regarding the form of +the user's preference relation. In this paper we consider a situation in which +alternatives have an associated vector of costs, each component corresponding +to a different criterion, and are compared using a kind of lexicographic order, +similar to the way alternatives are compared in a Hierarchical Constraint Logic +Programming model. It is assumed that the user has some (unknown) importance +ordering on criteria, and that to compare two alternatives, firstly, the +combined cost of each alternative with respect to the most important criteria +are compared; only if these combined costs are equal, are the next most +important criteria considered. The preference inference problem then consists +of determining whether a preference statement can be inferred from a set of +input preferences. We show that this problem is coNP-complete, even if one +restricts the cardinality of the equal-importance sets to have at most two +elements, and one only considers non-strict preferences. However, it is +polynomial if it is assumed that the user's ordering of criteria is a total +ordering; it is also polynomial if the sets of equally important criteria are +all equivalence classes of a given fixed equivalence relation. We give an +efficient polynomial algorithm for these cases, which also throws light on the +structure of the inference. + +
+
+ comment: Longer Version of IJCAI'15 publication + https://www.ijcai.org/Proceedings/15/Papers/461.pdf +
+
+
+
+
+ + ☆ Resource approximation for the $λμ$-calculus + + +
+ The $\lambda\mu$-calculus plays a central role in the theory of programming +languages as it extends the Curry-Howard correspondence to classical logic. A +major drawback is that it does not satisfy B\"ohm's Theorem and it lacks the +corresponding notion of approximation. On the contrary, we show that Ehrhard +and Regnier's Taylor expansion can be easily adapted, thus providing a resource +conscious approximation theory. This produces a sensible $\lambda\mu$-theory +with which we prove some advanced properties of the $\lambda\mu$-calculus, such +as Stability and Perpendicular Lines Property, from which the impossibility of +parallel computations follows. + +
+
+
+
+
+ + ☆ Stability Property for the Call-by-Value $λ$-calculus through + Taylor Expansion + + +
+ We prove the Stability Property for the call-by-value $\lambda$-calculus (CbV +in the following). This result states necessary conditions under which the +contexts of the CbV $\lambda$-calculus commute with intersections of +approximants. This is an important non-trivial result, which implies the +sequentiality of the calculus. We prove it via the tool of Taylor-resource +approximation, whose power has been shown in several recent papers. This +technique is usually conceived for the ordinary $\lambda$-calculus, but it can +be easily defined for the CbV setting. Our proof is the adaptation of the one +for the ordinary calculus using the same technique, with some minimal technical +modification due to the fact that in the CbV setting one linearises terms in a +slightly different way than usual (cfr. $!(A\multimap B)$ vs $!A\multimap B$). +The content of this article is taken from the PhD thesis of the author. + +
+
+
+
+
+ + ☆ Denotational semantics driven simplicial homology? + + +
+ We look at the proofs of a fragment of Linear Logic as a whole: in fact, +Linear Logic's coherent semantics interprets the proofs of a given formula $A$ +as faces of an abstract simplicial complex, thus allowing us to see the set of +the (interpretations of the) proofs of $A$ as a geometrical space, not just a +set. This point of view has never been really investigated. For a ``webbed'' +denotational semantics -- say the relational one --, it suffices to down-close +the set of (the interpretations of the) proofs of $A$ in order to give rise to +an abstract simplicial complex whose faces do correspond to proofs of $A$. +Since this space comes triangulated by construction, a natural geometrical +property to consider is its homology. However, we immediately stumble on a +problem: if we want the homology to be invariant w.r.t. to some notion of +type-isomorphism, we are naturally led to consider the homology functor acting, +at the level of morphisms, on ``simplicial relations'' rather than simplicial +maps as one does in topology. The task of defining the homology functor on this +modified category can be achieved by considering a very simple monad, which is +almost the same as the power-set monad; but, doing so, we end up considering +not anymore the homology of the original space, but rather of its +transformation under the action of the monad. Does this transformation keep the +homology invariant ? Is this transformation meaningful from a geometrical or +logical/computational point of view ? + +
+
+
+
+
+ + ♻ ☆ Disentangling Parallelism and Interference in Game Semantics + + +
+ Game semantics is a denotational semantics presenting compositionally the
+computational behaviour of various kinds of effectful programs. One of its
+celebrated achievements is to have obtained full abstraction results for
+programming languages with a variety of computational effects, in a single
+framework. This is known as the semantic cube or Abramsky's cube, which for
+sequential deterministic programs establishes a correspondence between certain
+conditions on strategies (''innocence'', ''well-bracketing'', ''visibility'')
+and the absence of matching computational effects. Outside of the sequential
+deterministic realm, there is still a wealth of game semantics-based full
+abstraction results; but they no longer fit in a unified canvas. In particular,
+Ghica and Murawski's fully abstract model for shared state concurrency (IA)
+does not have a matching notion of pure parallel program-we say that
+parallelism and interference (i.e. state plus semaphores) are entangled. In
+this paper we construct a causal version of Ghica and Murawski's model, also
+fully abstract for IA. We provide compositional conditions parallel innocence
+and sequentiality, respectively banning interference and parallelism, and
+leading to four full abstraction results. To our knowledge, this is the first
+extension of Abramsky's semantic cube programme beyond the sequential
+deterministic world.
+
+
+
+
+
+
+
+
+ 
        Hardware Architecture 2
      
 +
+
+
+ + ☆ eBPF-mm: Userspace-guided memory management in Linux with eBPF MICRO'24 + + +
+ We leverage eBPF in order to implement custom policies in the Linux memory +subsystem. Inspired by CBMM, we create a mechanism that provides the kernel +with hints regarding the benefit of promoting a page to a specific size. We +introduce a new hook point in Linux page fault handling path for eBPF programs, +providing them the necessary context to determine the page size to be used. We +then develop a framework that allows users to define profiles for their +applications and load them into the kernel. A profile consists of memory +regions of interest and their expected benefit from being backed by 4KB, 64KB +and 2MB pages. In our evaluation, we profiled our workloads to identify hot +memory regions using DAMON. + +
+
+ comment: ACM SRC@MICRO'24 +
+
+
+
+
+ + ☆ FSL-HDnn: A 5.7 TOPS/W End-to-end Few-shot Learning Classifier + Accelerator with Feature Extraction and Hyperdimensional Computing + + +
+ This paper introduces FSL-HDnn, an energy-efficient accelerator that
+implements the end-to-end pipeline of feature extraction, classification, and
+on-chip few-shot learning (FSL) through gradient-free learning techniques in a
+40 nm CMOS process. At its core, FSL-HDnn integrates two low-power modules:
+Weight clustering feature extractor and Hyperdimensional Computing (HDC).
+Feature extractor utilizes advanced weight clustering and pattern reuse
+strategies for optimized CNN-based feature extraction. Meanwhile, HDC emerges
+as a novel approach for lightweight FSL classifier, employing hyperdimensional
+vectors to improve training accuracy significantly compared to traditional
+distance-based approaches. This dual-module synergy not only simplifies the
+learning process by eliminating the need for complex gradients but also
+dramatically enhances energy efficiency and performance. Specifically, FSL-HDnn
+achieves an unprecedented energy efficiency of 5.7 TOPS/W for feature
+extraction and 0.78 TOPS/W for classification and learning phases, achieving
+improvements of 2.6X and 6.6X, respectively, over current state-of-the-art CNN
+and FSL processors.
+
+
+ comment: 4 pages, 12 figures, ESSERC 2024 +
+
+
+
+
+
+
+
+ + Performance Profiling 3 + +
+
+
+ + ☆ Temporal Load Imbalance on Ondes3D Seismic Simulator for Different + Multicore Architectures + + +
+ The variety of today's multicore architectures motivates researchers to +explore parallel scientific applications on different platforms. Load imbalance +is one performance issue that can prejudice parallel applications from +exploiting the computational power of these platforms. Ondes3D is a scientific +application for seismic wave simulation used to assess the geological impact of +earthquakes. Its parallelism relies on applying a regular domain decomposition +in the geological domain provided and distributing each sub-domain to MPI +ranks. Previous works investigate the significant spatial and temporal +imbalance in Ondes3D and suggest new parallelization and load balancing +techniques to minimize them. However, none explored its execution on different +architectures. Our paper evaluates the performance of Ondes3D for two +earthquake scenarios on eight different multicore architectures, including +Intel, AMD, and ARM processors. We measure the load distribution per MPI rank, +evaluate the temporal load imbalance, and compare the execution of the +application's kernels. Our results show that the temporal load imbalance in +Ondes3D depends on the architecture chosen, with some platforms minimizing such +imbalance more effectively. + +
+
+ comment: The 2020 International Conference on High Performance Computing and + Simulation (HPCS 2020) +
+
+
+
+
+ + ☆ Can Graph Reordering Speed Up Graph Neural Network Training? An + Experimental Study + + +
+ Graph neural networks (GNNs) are a type of neural network capable of learning +on graph-structured data. However, training GNNs on large-scale graphs is +challenging due to iterative aggregations of high-dimensional features from +neighboring vertices within sparse graph structures combined with neural +network operations. The sparsity of graphs frequently results in suboptimal +memory access patterns and longer training time. Graph reordering is an +optimization strategy aiming to improve the graph data layout. It has shown to +be effective to speed up graph analytics workloads, but its effect on the +performance of GNN training has not been investigated yet. The generalization +of reordering to GNN performance is nontrivial, as multiple aspects must be +considered: GNN hyper-parameters such as the number of layers, the number of +hidden dimensions, and the feature size used in the GNN model, neural network +operations, large intermediate vertex states, and GPU acceleration. + In our work, we close this gap by performing an empirical evaluation of 12 +reordering strategies in two state-of-the-art GNN systems, PyTorch Geometric +and Deep Graph Library. Our results show that graph reordering is effective in +reducing training time for CPU- and GPU-based training, respectively. Further, +we find that GNN hyper-parameters influence the effectiveness of reordering, +that reordering metrics play an important role in selecting a reordering +strategy, that lightweight reordering performs better for GPU-based than for +CPU-based training, and that invested reordering time can in many cases be +amortized. + +
+
+ comment: To be published in proceedings of the 51st International Conference + on Very Large Data Bases (VLDB), September 1-5, 2025 +
+
+
+
+
+ + ♻ ☆ Integrating ytopt and libEnsemble to Autotune OpenMC + + +
+ ytopt is a Python machine-learning-based autotuning software package +developed within the ECP PROTEAS-TUNE project. The ytopt software adopts an +asynchronous search framework that consists of sampling a small number of input +parameter configurations and progressively fitting a surrogate model over the +input-output space until exhausting the user-defined maximum number of +evaluations or the wall-clock time. libEnsemble is a Python toolkit for +coordinating workflows of asynchronous and dynamic ensembles of calculations +across massively parallel resources developed within the ECP PETSc/TAO project. +libEnsemble helps users take advantage of massively parallel resources to solve +design, decision, and inference problems and expands the class of problems that +can benefit from increased parallelism. In this paper we present our +methodology and framework to integrate ytopt and libEnsemble to take advantage +of massively parallel resources to accelerate the autotuning process. +Specifically, we focus on using the proposed framework to autotune the ECP +ExaSMR application OpenMC, an open source Monte Carlo particle transport code. +OpenMC has seven tunable parameters some of which have large ranges such as the +number of particles in-flight, which is in the range of 100,000 to 8 million, +with its default setting of 1 million. Setting the proper combination of these +parameter values to achieve the best performance is extremely time-consuming. +Therefore, we apply the proposed framework to autotune the MPI/OpenMP offload +version of OpenMC based on a user-defined metric such as the figure of merit +(FoM) (particles/s) or energy efficiency energy-delay product (EDP) on Crusher +at Oak Ridge Leadership Computing Facility. The experimental results show that +we achieve improvement up to 29.49\% in FoM and up to 30.44\% in EDP. + +
+
+
+
+
+
+
+
+
+ Operating Systems 3
+
+
+
+
+ + ☆ Analysis of Synchronization Mechanisms in Operating Systems + + +
+ This research analyzed the performance and consistency of four +synchronization mechanisms-reentrant locks, semaphores, synchronized methods, +and synchronized blocks-across three operating systems: macOS, Windows, and +Linux. Synchronization ensures that concurrent processes or threads access +shared resources safely, and efficient synchronization is vital for maintaining +system performance and reliability. The study aimed to identify the +synchronization mechanism that balances efficiency, measured by execution time, +and consistency, assessed by variance and standard deviation, across platforms. +The initial hypothesis proposed that mutex-based mechanisms, specifically +synchronized methods and blocks, would be the most efficient due to their +simplicity. However, empirical results showed that reentrant locks had the +lowest average execution time (14.67ms), making them the most efficient +mechanism, but with the highest variability (standard deviation of 1.15). In +contrast, synchronized methods, blocks, and semaphores exhibited higher average +execution times (16.33ms for methods and 16.67ms for blocks) but with greater +consistency (variance of 0.33). The findings indicated that while reentrant +locks were faster, they were more platform-dependent, whereas mutex-based +mechanisms provided more predictable performance across all operating systems. +The use of virtual machines for Windows and Linux was a limitation, potentially +affecting the results. Future research should include native testing and +explore additional synchronization mechanisms and higher concurrency levels. +These insights help developers and system designers optimize synchronization +strategies for either performance or stability, depending on the application's +requirements. + +
+
+ comment: This paper was submitted to the 2nd International Conference on + Computer Science and Software Engineering (CSSE 2024). It contains 19 pages +
+
+
+
+
+ + ☆ eBPF-mm: Userspace-guided memory management in Linux with eBPF MICRO'24 + + +
+ We leverage eBPF in order to implement custom policies in the Linux memory +subsystem. Inspired by CBMM, we create a mechanism that provides the kernel +with hints regarding the benefit of promoting a page to a specific size. We +introduce a new hook point in Linux page fault handling path for eBPF programs, +providing them the necessary context to determine the page size to be used. We +then develop a framework that allows users to define profiles for their +applications and load them into the kernel. A profile consists of memory +regions of interest and their expected benefit from being backed by 4KB, 64KB +and 2MB pages. In our evaluation, we profiled our workloads to identify hot +memory regions using DAMON. + +
+
+ comment: ACM SRC@MICRO'24 +
+
+
+
+
+ + ☆ Skip TLB flushes for reused pages within mmap's + + +
+ Memory access efficiency is significantly enhanced by caching recent address +translations in the CPUs' Translation Lookaside Buffers (TLBs). However, since +the operating system is not aware of which core is using a particular mapping, +it flushes TLB entries across all cores where the application runs whenever +addresses are unmapped, ensuring security and consistency. These TLB flushes, +known as TLB shootdowns, are costly and create a performance and scalability +bottleneck. A key contributor to TLB shootdowns is memory-mapped I/O, +particularly during mmap-munmap cycles and page cache evictions. Often, the +same physical pages are reassigned to the same process post-eviction, +presenting an opportunity for the operating system to reduce the frequency of +TLB shootdowns. We demonstrate, that by slightly extending the mmap function, +TLB shootdowns for these "recycled pages" can be avoided. + Therefore we introduce and implement the "fast page recycling" (FPR) feature +within the mmap system call. FPR-mmaps maintain security by only triggering TLB +shootdowns when a page exits its recycling cycle and is allocated to a +different process. To ensure consistency when FPR-mmap pointers are used, we +made minor adjustments to virtual memory management to avoid the ABA problem. +Unlike previous methods to mitigate shootdown effects, our approach does not +require any hardware modifications and operates transparently within the +existing Linux virtual memory framework. + Our evaluations across a variety of CPU, memory, and storage setups, +including persistent memory and Optane SSDs, demonstrate that FPR delivers +notable performance gains, with improvements of up to 28% in real-world +applications and 92% in micro-benchmarks. Additionally, we show that TLB +shootdowns are a significant source of bottlenecks, previously misattributed to +other components of the Linux kernel. + +
+
+
+
+
+
+
+
+ + Computational Complexity 3 + +
+
+
+ + ☆ The Complexity of Maximizing the MST-ratio + + +
+ Given a finite set of red and blue points in $\mathbb{R}^d$, the MST-ratio is +the combined length of the Euclidean minimum spanning trees of red points and +of blue points divided by the length of the Euclidean minimum spanning tree of +the union of them. The maximum MST-ratio of a point set is the maximum +MST-ratio over all non-trivial colorings of its points by red and blue. We +prove that the problem of finding the maximum MST-ratio of a given point set is +NP-hard when the dimension is a part of the input. Moreover, we present a +$O(n^2)$ running time $3$-approximation algorithm for it. As a part of the +proof, we show that in any metric space, the maximum MST-ratio is smaller than +$3$. Additionally, we study the average MST-ratio over all colorings of a set +of $n$ points. We show that this average is always at least $\frac{n-2}{n-1}$, +and for $n$ random points uniformly distributed in a $d$-dimensional unit cube, +the average tends to $\sqrt[d]{2}$ in expectation as $n$ goes to infinity. + +
+
+
+
+
+ + ☆ The Sample Complexity of Smooth Boosting and the Tightness of the + Hardcore Theorem + + +
+ Smooth boosters generate distributions that do not place too much weight on +any given example. Originally introduced for their noise-tolerant properties, +such boosters have also found applications in differential privacy, +reproducibility, and quantum learning theory. We study and settle the sample +complexity of smooth boosting: we exhibit a class that can be weak learned to +$\gamma$-advantage over smooth distributions with $m$ samples, for which strong +learning over the uniform distribution requires +$\tilde{\Omega}(1/\gamma^2)\cdot m$ samples. This matches the overhead of +existing smooth boosters and provides the first separation from the setting of +distribution-independent boosting, for which the corresponding overhead is +$O(1/\gamma)$. + Our work also sheds new light on Impagliazzo's hardcore theorem from +complexity theory, all known proofs of which can be cast in the framework of +smooth boosting. For a function $f$ that is mildly hard against size-$s$ +circuits, the hardcore theorem provides a set of inputs on which $f$ is +extremely hard against size-$s'$ circuits. A downside of this important result +is the loss in circuit size, i.e. that $s' \ll s$. Answering a question of +Trevisan, we show that this size loss is necessary and in fact, the parameters +achieved by known proofs are the best possible. + +
+
+ comment: 46 pages, FOCS 2024 +
+
+
+
+
+ + ♻ ☆ The Tensor as an Informational Resource + + +
+ A tensor is a multidimensional array of numbers that can be used to store +data, encode a computational relation and represent quantum entanglement. In +this sense a tensor can be viewed as valuable resource whose transformation can +lead to an understanding of structure in data, computational complexity and +quantum information. + In order to facilitate the understanding of this resource, we propose a +family of information-theoretically constructed preorders on tensors, which can +be used to compare tensors with each other and to assess the existence of +transformations between them. The construction places copies of a given tensor +at the edges of a hypergraph and allows transformations at the vertices. A +preorder is then induced by the transformations possible in a given growing +sequence of hypergraphs. The new family of preorders generalises the asymptotic +restriction preorder which Strassen defined in order to study the computational +complexity of matrix multiplication. + We derive general properties of the preorders and their associated asymptotic +notions of tensor rank and view recent results on tensor rank non-additivity, +tensor networks and algebraic complexity in this unifying frame. We hope that +this work will provide a useful vantage point for exploring tensors in applied +mathematics, physics and computer science, but also from a purely mathematical +point of view. + +
+
+ comment: 28 pages +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 2 + +
+
+
+ + ☆ Query Learning of Advice and Nominal Automata + + +
+ Learning automata by queries is a long-studied area initiated by Angluin in +1987 with the introduction of the $L^*$ algorithm to learn regular languages, +with a large body of work afterwards on many different variations and +generalizations of DFAs. Recently, Chase and Freitag introduced a novel +approach to proving query learning bounds by computing combinatorial complexity +measures for the classes in question, which they applied to the setting of DFAs +to obtain qualitatively different results compared to the $L^*$ algorithm. +Using this approach, we prove new query learning bounds for two generalizations +of DFAs. The first setting is that of advice DFAs, which are DFAs augmented +with an advice string that informs the DFA's transition behavior at each step. +For advice DFAs, we give the first known upper bounds for query complexity. The +second setting is that of nominal DFAs, which generalize DFAs to infinite +alphabets which admit some structure via symmetries. For nominal DFAs, we make +qualitative improvements over prior results. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ Standard Automata Theory and Process Algebra + + +
+ The concepts of machine homomorphism and machine products developed in the +automata theory literature in the 1960s are more relevant to concurrent systems +than is acknowledged in the process algebra literature and offer a +sophisticated mathematical basis for understanding concurrent systems. + +
+
+ comment: fixes a number of typographical errors and sub-optimal phrasings +
+
+
+
+
+
+
+
+
+ +
+
+
+
+ Hardware Architecture 7
+
+
+
+
+ + ☆ PASS: An Asynchronous Probabilistic Processor for Next Generation + Intelligence + + +
+ New computing paradigms are required to solve the most challenging
+computational problems where no exact polynomial time solution exists.
+Probabilistic Ising Accelerators have gained promise on these problems with
+the ability to model complex probability distributions and find ground
+states of intractable problems. In this context, we have demonstrated the
+Parallel Asynchronous Stochastic Sampler (PASS), the first fully on-chip
+integrated, asynchronous, probabilistic accelerator that takes advantage of the
+intrinsic fine-grained parallelism of the Ising Model and built in state of the
+art 14nm CMOS FinFET technology. We have demonstrated broad applicability of
+this accelerator on problems ranging from Combinatorial Optimization, Neural
+Simulation, to Machine Learning along with up to $23,000$x energy to solution
+improvement compared to CPUs on probabilistic problems.
+
+
+ comment: 13 page main text, 5 main figures, 21 pages supplementary and + methods, 7 supplementary figures, 2 supplementary tables +
+
+
+
+
+ + ☆ Count2Multiply: Reliable In-memory High-Radix Counting + + +
+ Big data processing has exposed the limits of compute-centric hardware +acceleration due to the memory-to-processor bandwidth bottleneck. Consequently, +there has been a shift towards memory-centric architectures, leveraging +substantial compute parallelism by processing using the memory elements +directly. Computing-in-memory (CIM) proposals for both conventional and +emerging memory technologies often target massively parallel operations. +However, current CIM solutions face significant challenges. For emerging +data-intensive applications, such as advanced machine learning techniques and +bioinformatics, where matrix multiplication is a key primitive, memristor +crossbars suffer from limited write endurance and expensive write operations. +In contrast, while DRAM-based solutions have successfully demonstrated +multiplication using additions, they remain prohibitively slow. This paper +introduces Count2Multiply, a technology-agnostic digital-CIM method for +performing integer-binary and integer-integer matrix multiplications using +high-radix, massively parallel counting implemented with bitwise logic +operations. In addition, Count2Multiply is designed with fault tolerance in +mind and leverages traditional scalable row-wise error correction codes, such +as Hamming and BCH codes, to protect against the high error rates of existing +CIM designs. We demonstrate Count2Multiply with a detailed application to CIM +in conventional DRAM due to its ubiquity and high endurance. We also explore +the acceleration potential of racetrack memories due to their shifting +properties, which are natural for Count2Multiply, and their high endurance. +Compared to the state-of-the-art in-DRAM method, Count2Multiply achieves up to +10x speedup, 3.8x higher GOPS/Watt, and 1.4x higher GOPS/area, while the RTM +counterpart offers gains of 10x, 57x, and 3.8x. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Enhancing Industrial Cybersecurity: SoftHSM Implementation on SBCs for + Mitigating MITM Attacks + + +
+ The rapid growth of industrial technology, driven by automation, IoT, and +cloud computing, has also increased the risk of cyberattacks, such as +Man-in-the-Middle (MITM) attacks. A standard solution to protect data is using +a Hardware Security Module (HSM), but its high implementation cost has led to +the development of a more affordable alternative: SoftHSM. This software-based +module manages encryption and decryption keys using cryptographic algorithms. +This study simulates the use of SoftHSM on a single-board computer (SBC) to +enhance industrial system security and cost-effectively mitigate MITM attacks. +The security system integrates AES and RSA cryptographic algorithms, with +SoftHSM handling RSA key storage. The results show that HSM protects RSA +private keys from extraction attempts, ensuring data security. In terms of +performance, the system achieved an average encryption time of 3.29 seconds, a +slot access time of 0.018 seconds, and a decryption time of 2.558 seconds. It +also demonstrated efficient memory usage, with 37.24% for encryption and 24.24% +for decryption, while consuming 5.20 V and 0.72 A during processing. + +
+
+
+
+
+ + ☆ High-Security Hardware Module with PUF and Hybrid Cryptography for Data + Security + + +
+ This research highlights the rapid development of technology in the industry, +particularly Industry 4.0, supported by fundamental technologies such as the +Internet of Things (IoT), cloud computing, big data, and data analysis. Despite +providing efficiency, these developments also bring negative impacts, such as +increased cyber-attacks, especially in manufacturing. One standard attack in +the industry is the man-in-the-middle (MITM) attack, which can have severe +consequences for the physical data transfer, particularly on the integrity of +sensor and actuator data in industrial machines. This research proposes a +solution by developing a hardware security module (HSM) using a +field-programmable gate array (FPGA) with physical unclonable function (PUF) +authentication and a hybrid encryption data security system. Experimental +results show that this research improves some criteria in industrial +cybersecurity, ensuring critical data security from cyber-attacks in industrial +machines. + +
+
+
+
+
+ + MARCA: Mamba Accelerator with ReConfigurable Architecture + + +
+ We propose a Mamba accelerator with reconfigurable architecture, MARCA. We
+propose three novel approaches in this paper. (1) Reduction alternative PE
+array architecture for both linear and element-wise operations. For linear
+operations, the reduction tree connected to PE arrays is enabled and executes
+the reduction operation. For element-wise operations, the reduction tree is
+disabled and the output bypasses. (2) Reusable nonlinear function unit based on
+the reconfigurable PE. We decompose the exponential function into element-wise
+operations and a shift operation by a fast biased exponential algorithm, and
+the activation function (SiLU) into a range detection and element-wise
+operations by a piecewise approximation algorithm. Thus, the reconfigurable PEs
+are reused to execute nonlinear functions with negligible accuracy loss. (3)
+Intra-operation and inter-operation buffer management strategy. We propose
+intra-operation buffer management strategy to maximize input data sharing for
+linear operations within operations, and inter-operation strategy for
+element-wise operations between operations. We conduct extensive experiments on
+Mamba model families with different sizes. MARCA achieves up to
+463.22$\times$/11.66$\times$ speedup and up to 9761.42$\times$/242.52$\times$
+energy efficiency compared to Intel Xeon 8358P CPU and NVIDIA Tesla A100 GPU
+implementations, respectively.
+
+
+ comment: 9 pages, 10 figures, accepted by ICCAD 2024. arXiv admin note: text + overlap with arXiv:2001.02514 by other authors +
+
+
+
+
+ + ♻ ☆ The Impact of Run-Time Variability on Side-Channel Attacks Targeting + FPGAs + + +
+ To defeat side-channel attacks, many recent countermeasures work by enforcing +random run-time variability to the target computing platform in terms of clock +jitters, frequency and voltage scaling, and phase shift, also combining the +contributions from different actuators to maximize the side-channel resistance +of the target. However, the robustness of such solutions seems strongly +influenced by several hyper-parameters for which an in-depth analysis is still +missing. This work proposes a fine-grained dynamic voltage and frequency +scaling actuator to investigate the effectiveness of recent desynchronization +countermeasures with the goal of highlighting the link between the enforced +run-time variability and the vulnerability to side-channel attacks of +cryptographic implementations targeting FPGAs. The analysis of the results +collected from real hardware allowed for a comprehensive understanding of the +protection offered by run-time variability countermeasures against side-channel +attacks. + +
+
+ comment: Accepted for lecture presentation at 2024 31st IEEE International + Conference on Electronics, Circuits and Systems (ICECS), Nancy, France, Nov. + 18-20, 2024 +
+
+
+
+
+ + ♻ ☆ JugglePAC: a Pipelined Accumulation Circuit + + +
+ Reducing a set of numbers to a single value is a fundamental operation in +applications such as signal processing, data compression, scientific computing, +and neural networks. Accumulation, which involves summing a dataset to obtain a +single result, is crucial for these tasks. Due to hardware constraints, large +vectors or matrices often cannot be fully stored in memory and must be read +sequentially, one item per clock cycle. For high-speed inputs, such as rapidly +arriving floating-point numbers, pipelined adders are necessary to maintain +performance. However, pipelining introduces multiple intermediate sums and +requires delays between back-to-back datasets unless their processing is +overlapped. In this paper, we present JugglePAC, a novel accumulation circuit +designed to address these challenges. JugglePAC operates quickly, is +area-efficient, and features a fully pipelined design. It effectively manages +back-to-back variable-length datasets while consistently producing results in +the correct input order. Compared to the state-of-the-art, JugglePAC achieves +higher throughput and reduces area complexity, offering significant +improvements in performance and efficiency. + +
+
+ comment: 4 pages, 1 figures, 2 tables +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 17 + +
+
+
+ + ☆ TPFL: Tsetlin-Personalized Federated Learning with Confidence-Based + Clustering + + +
+ The world of Machine Learning (ML) has witnessed rapid changes in terms of
+new models and ways to process users data. The majority of work that has been
+done is focused on Deep Learning (DL) based approaches. However, with the
+emergence of new algorithms such as the Tsetlin Machine (TM) algorithm, there
+is growing interest in exploring alternative approaches that may offer unique
+advantages in certain domains or applications. One of these domains is
+Federated Learning (FL), in which users privacy is of utmost importance. Due to
+its novelty, FL has seen a surge in the incorporation of personalization
+techniques to enhance model accuracy while maintaining user privacy under
+personalized conditions. In this work, we propose a novel approach dubbed TPFL:
+Tsetlin-Personalized Federated Learning, in which models are grouped into
+clusters based on their confidence towards a specific class. In this way,
+clustering can benefit from two key advantages. Firstly, clients share only
+what they are confident about, resulting in the elimination of wrongful weight
+aggregation among clients whose data for a specific class may have not been
+enough during the training. This phenomenon is prevalent when the data are
+non-Independent and Identically Distributed (non-IID). Secondly, by sharing
+only weights towards a specific class, communication cost is substantially
+reduced, making TPFL efficient in terms of both accuracy and communication
+cost. The results of TPFL demonstrated the highest accuracy on three different
+datasets; namely MNIST, FashionMNIST and FEMNIST.
+
+
+
+
+
+ + ☆ PASS: An Asynchronous Probabilistic Processor for Next Generation + Intelligence + + +
+ New computing paradigms are required to solve the most challenging
+computational problems where no exact polynomial time solution exists.
+Probabilistic Ising Accelerators have gained promise on these problems with
+the ability to model complex probability distributions and find ground
+states of intractable problems. In this context, we have demonstrated the
+Parallel Asynchronous Stochastic Sampler (PASS), the first fully on-chip
+integrated, asynchronous, probabilistic accelerator that takes advantage of the
+intrinsic fine-grained parallelism of the Ising Model and built in state of the
+art 14nm CMOS FinFET technology. We have demonstrated broad applicability of
+this accelerator on problems ranging from Combinatorial Optimization, Neural
+Simulation, to Machine Learning along with up to $23,000$x energy to solution
+improvement compared to CPUs on probabilistic problems.
+
+
+ comment: 13 page main text, 5 main figures, 21 pages supplementary and + methods, 7 supplementary figures, 2 supplementary tables +
+
+
+
+
+ + ☆ Maintaining Distributed Data Structures in Dynamic Peer-to-Peer Networks + + +
+ We study robust and efficient distributed algorithms for building and +maintaining distributed data structures in dynamic Peer-to-Peer (P2P) networks. +P2P networks are characterized by a high level of dynamicity with abrupt heavy +node \emph{churn} (nodes that join and leave the network continuously over +time). We present a novel algorithm that builds and maintains with high +probability a skip list for $poly(n)$ rounds despite $\mathcal{O}(n/\log n)$ +churn \emph{per round} ($n$ is the stable network size). We assume that the +churn is controlled by an oblivious adversary (that has complete knowledge and +control of what nodes join and leave and at what time and has unlimited +computational power, but is oblivious to the random choices made by the +algorithm). Moreover, the maintenance overhead is proportional to the churn +rate. Furthermore, the algorithm is scalable in that the messages are small +(i.e., at most $polylog(n)$ bits) and every node sends and receives at most +$polylog(n)$ messages per round. + Our algorithm crucially relies on novel distributed and parallel algorithms +to merge two $n$-elements skip lists and delete a large subset of items, both +in $\mathcal{O}(\log n)$ rounds with high probability. These procedures may be +of independent interest due to their elegance and potential applicability in +other contexts in distributed data structures. + To the best of our knowledge, our work provides the first-known +fully-distributed data structure that provably works under highly dynamic +settings (i.e., high churn rate). Furthermore, they are localized (i.e., do not +require any global topological knowledge). Finally, we believe that our +framework can be generalized to other distributed and dynamic data structures +including graphs, potentially leading to stable distributed computation despite +heavy churn. + +
+
+
+
+
+ + ☆ Privacy-Preserving Distributed Maximum Consensus Without Accuracy Loss + + +
+ In distributed networks, calculating the maximum element is a fundamental +task in data analysis, known as the distributed maximum consensus problem. +However, the sensitive nature of the data involved makes privacy protection +essential. Despite its importance, privacy in distributed maximum consensus has +received limited attention in the literature. Traditional privacy-preserving +methods typically add noise to updates, degrading the accuracy of the final +result. To overcome these limitations, we propose a novel distributed +optimization-based approach that preserves privacy without sacrificing +accuracy. Our method introduces virtual nodes to form an augmented graph and +leverages a carefully designed initialization process to ensure the privacy of +honest participants, even when all their neighboring nodes are dishonest. +Through a comprehensive information-theoretical analysis, we derive a +sufficient condition to protect private data against both passive and +eavesdropping adversaries. Extensive experiments validate the effectiveness of +our approach, demonstrating that it not only preserves perfect privacy but also +maintains accuracy, outperforming existing noise-based methods that typically +suffer from accuracy loss. + +
+
+
+
+
+ + ☆ Coordination-free Collaborative Replication based on Operational + Transformation + + +
+ We introduce Coordination-free Collaborative Replication (CCR), a new method +for maintaining consistency across replicas in distributed systems without +requiring explicit coordination messages. CCR automates conflict resolution, +contrasting with traditional Data-sharing systems that typically involve +centralized update management or predefined consistency rules. + Operational Transformation (OT), commonly used in collaborative editing, +ensures consistency by transforming operations while maintaining document +integrity across replicas. However, OT assumes server-based coordination, which +is unsuitable for modern, decentralized Peer-to-Peer (P2P) systems. + Conflict-free Replicated Data Type (CRDT), like Two-Phase Sets (2P-Sets), +guarantees eventual consistency by allowing commutative and associative +operations but often results in counterintuitive behaviors, such as failing to +re-add an item to a shopping cart once removed. + In contrast, CCR employs a more intuitive approach to replication. It allows +for straightforward updates and conflict resolution based on the current data +state, enhancing clarity and usability compared to CRDTs. Furthermore, CCR +addresses inefficiencies in messaging by developing a versatile protocol based +on data stream confluence, thus providing a more efficient and practical +solution for collaborative data sharing in distributed systems. + +
+
+
+
+
+ + ☆ HPC with Enhanced User Separation + + +
+ HPC systems used for research run a wide variety of software and workflows. +This software is often written or modified by users to meet the needs of their +research projects, and rarely is built with security in mind. In this paper we +explore several of the key techniques that MIT Lincoln Laboratory +Supercomputing Center has deployed on its systems to manage the security +implications of these workflows by providing enforced separation for processes, +filesystem access, network traffic, and accelerators to make every user feel +like they are running on a personal HPC. + +
+
+
+
+
+ + ☆ Advances in ArborX to support exascale applications HPCA + + +
+ ArborX is a performance portable geometric search library developed as part +of the Exascale Computing Project (ECP). In this paper, we explore a +collaboration between ArborX and a cosmological simulation code HACC. Large +cosmological simulations on exascale platforms encounter a bottleneck due to +the in-situ analysis requirements of halo finding, a problem of identifying +dense clusters of dark matter (halos). This problem is solved by using a +density-based DBSCAN clustering algorithm. With each MPI rank handling hundreds +of millions of particles, it is imperative for the DBSCAN implementation to be +efficient. In addition, the requirement to support exascale supercomputers from +different vendors necessitates performance portability of the algorithm. We +describe how this challenge problem guided ArborX development, and enhanced the +performance and the scope of the library. We explore the improvements in the +basic algorithms for the underlying search index to improve the performance, +and describe several implementations of DBSCAN in ArborX. Further, we report +the history of the changes in ArborX and their effect on the time to solve a +representative benchmark problem, as well as demonstrate the real world impact +on production end-to-end cosmology simulations. + +
+
+ comment: Submitted to IJHPCA +
+
+
+
+
+ + ☆ Deterministic Bounds in Committee Selection: Enhancing Decentralization + and Scalability in Distributed Ledgers + + +
+ Consensus plays a crucial role in distributed ledger systems, impacting both +scalability and decentralization. Many blockchain systems use a weighted +lottery based on a scarce resource such as a stake, storage, memory, or +computing power to select a committee whose members drive the consensus and are +responsible for adding new information to the ledger. Therefore, ensuring a +robust and fair committee selection process is essential for maintaining +security, efficiency, and decentralization. + There are two main approaches to randomized committee selection. In one +approach, each validator candidate locally checks whether they are elected to +the committee and reveals their proof during the consensus phase. In contrast, +in the second approach, a sortition algorithm decides a fixed-sized committee +that is globally verified. This paper focuses on the latter approach, with +cryptographic sortition as a method for fair committee selection that +guarantees a constant committee size. Our goal is to develop deterministic +guarantees that strengthen decentralization. We introduce novel methods that +provide deterministic bounds on the influence of adversaries within the +committee, as evidenced by numerical experiments. This approach overcomes the +limitations of existing protocols that only offer probabilistic guarantees, +often providing large committees that are impractical for many quorum-based +applications like atomic broadcast and randomness beacon protocols. + +
+
+
+
+
+ + ☆ A Study of Performance Programming of CPU, GPU accelerated Computers and + SIMD Architecture + + +
+ Parallel computing is a standard approach to achieving high-performance +computing (HPC). Three commonly used methods to implement parallel computing +include: 1) applying multithreading technology on single-core or multi-core +CPUs; 2) incorporating powerful parallel computing devices such as GPUs, FPGAs, +and other accelerators; and 3) utilizing special parallel architectures like +Single Instruction/Multiple Data (SIMD). + Many researchers have made efforts using different parallel technologies, +including developing applications, conducting performance analyses, identifying +performance bottlenecks, and proposing feasible solutions. However, balancing +and optimizing parallel programs remain challenging due to the complexity of +parallel algorithms and hardware architectures. Issues such as data transfer +between hosts and devices in heterogeneous systems continue to be bottlenecks +that limit performance. + This work summarizes a vast amount of information on various parallel +programming techniques, aiming to present the current state and future +development trends of parallel programming, performance issues, and solutions. +It seeks to give readers an overall picture and provide background knowledge to +support subsequent research. + +
+
+
+
+
+ + ☆ A Green Multi-Attribute Client Selection for Over-The-Air Federated + Learning: A Grey-Wolf-Optimizer Approach + + +
+ Federated Learning (FL) has gained attention across various industries for +its capability to train machine learning models without centralizing sensitive +data. While this approach offers significant benefits such as privacy +preservation and decreased communication overhead, it presents several +challenges, including deployment complexity and interoperability issues, +particularly in heterogeneous scenarios or resource-constrained environments. +Over-the-air (OTA) FL was introduced to tackle these challenges by +disseminating model updates without necessitating direct device-to-device +connections or centralized servers. However, OTA-FL brought forth limitations +associated with heightened energy consumption and network latency. In this +paper, we propose a multi-attribute client selection framework employing the +grey wolf optimizer (GWO) to strategically control the number of participants +in each round and optimize the OTA-FL process while considering accuracy, +energy, delay, reliability, and fairness constraints of participating devices. +We evaluate the performance of our multi-attribute client selection approach in +terms of model loss minimization, convergence time reduction, and energy +efficiency. In our experimental evaluation, we assessed and compared the +performance of our approach against the existing state-of-the-art methods. Our +results demonstrate that the proposed GWO-based client selection outperforms +these baselines across various metrics. Specifically, our approach achieves a +notable reduction in model loss, accelerates convergence time, and enhances +energy efficiency while maintaining high fairness and reliability indicators. + +
+
+
+
+
+ + ♻ ☆ DFDG: Data-Free Dual-Generator Adversarial Distillation for One-Shot + Federated Learning + + +
+ Federated Learning (FL) is a distributed machine learning scheme in which +clients jointly participate in the collaborative training of a global model by +sharing model information rather than their private datasets. In light of +concerns associated with communication and privacy, one-shot FL with a single +communication round has emerged as a de facto promising solution. However, +existing one-shot FL methods either require public datasets, focus on model +homogeneous settings, or distill limited knowledge from local models, making it +difficult or even impractical to train a robust global model. To address these +limitations, we propose a new data-free dual-generator adversarial distillation +method (namely DFDG) for one-shot FL, which can explore a broader local models' +training space via training dual generators. DFDG is executed in an adversarial +manner and comprises two parts: dual-generator training and dual-model +distillation. In dual-generator training, we delve into each generator +concerning fidelity, transferability and diversity to ensure its utility, and +additionally tailor the cross-divergence loss to lessen the overlap of dual +generators' output spaces. In dual-model distillation, the trained dual +generators work together to provide the training data for updates of the global +model. At last, our extensive experiments on various image classification tasks +show that DFDG achieves significant performance gains in accuracy compared to +SOTA baselines. + +
+
+ comment: Accepted by ICDM2024 main conference (long paper) +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Federated Learning with Consistency via Knowledge + Distillation Using Conditional Generator + + +
+ Federated Learning (FL) is gaining popularity as a distributed learning +framework that only shares model parameters or gradient updates and keeps +private data locally. However, FL is at risk of privacy leakage caused by +privacy inference attacks. And most existing privacy-preserving mechanisms in +FL conflict with achieving high performance and efficiency. Therefore, we +propose FedMD-CG, a novel FL method with highly competitive performance and +high-level privacy preservation, which decouples each client's local model into +a feature extractor and a classifier, and utilizes a conditional generator +instead of the feature extractor to perform server-side model aggregation. To +ensure the consistency of local generators and classifiers, FedMD-CG leverages +knowledge distillation to train local models and generators at both the latent +feature level and the logit level. Also, we construct additional classification +losses and design new diversity losses to enhance client-side training. +FedMD-CG is robust to data heterogeneity and does not require training extra +discriminators (like cGAN). We conduct extensive experiments on various image +classification tasks to validate the superiority of FedMD-CG. + +
+
+
+
+
+ + ♻ ☆ RTop-K: Ultra-Fast Row-Wise Top-K Algorithm and GPU Implementation for + Neural Networks + + +
+ Top-k algorithms are essential in various applications, from high-performance +computing and information retrieval to big data and neural network model +training. This paper introduces RTop-K, a highly efficient parallel row-wise +top-k selection algorithm designed for GPUs. RTop-K employs a Binary +Search-based approach to optimize resource allocation and provides a scalable +solution that significantly accelerates top-k operations. We perform a +theoretical analysis of the effects of early stopping in our algorithm, +demonstrating that it maintains the accuracy of neural network models while +enhancing performance. Comprehensive tests show that our GPU implementation of +RTop-K outperforms other row-wise top-k GPU implementations, with minimal +impact on testing accuracy when early stopping is applied. Notably, RTop-K +achieves speed increases ranging from 4.245$\times$ to 9.506$\times$ with early +stopping, and 3.936$\times$ without early stopping, compared to +state-of-the-art implementations. The proposed methods offer significant +improvements in the training and inference of Graph Neural Networks (GNNs), +addressing critical challenges in latency and throughput on GPU platforms. + +
+
+ comment: Need to improve the experiment part +
+
+
+
+
+ + ♻ ☆ Scalable Distributed Algorithms for Size-Constrained Submodular + Maximization in the MapReduce and Adaptive Complexity Models + + +
+ Distributed maximization of a submodular function in the MapReduce (MR) model +has received much attention, culminating in two frameworks that allow a +centralized algorithm to be run in the MR setting without loss of +approximation, as long as the centralized algorithm satisfies a certain +consistency property -- which had previously only been known to be satisfied by +the standard greedy and continuous greedy algorithms. A separate line of work +has studied parallelizability of submodular maximization in the adaptive +complexity model, where each thread may have access to the entire ground set. +For the size-constrained maximization of a monotone and submodular function, we +show that several sublinearly adaptive (highly parallelizable) algorithms +satisfy the consistency property required to work in the MR setting, which +yields practical, parallelizable and distributed algorithms. Separately, we +develop the first distributed algorithm with linear query complexity for this +problem. Finally, we provide a method to increase the maximum cardinality +constraint for MR algorithms at the cost of additional MR rounds. + +
+
+
+
+
+ + ♻ ☆ Local Methods with Adaptivity via Scaling + + +
+ The rapid development of machine learning and deep learning has introduced +increasingly complex optimization challenges that must be addressed. Indeed, +training modern, advanced models has become difficult to implement without +leveraging multiple computing nodes in a distributed environment. Distributed +optimization is also fundamental to emerging fields such as federated learning. +Specifically, there is a need to organize the training process to minimize the +time lost due to communication. A widely used and extensively researched +technique to mitigate the communication bottleneck involves performing local +training before communication. This approach is the focus of our paper. +Concurrently, adaptive methods that incorporate scaling, notably led by Adam, +have gained significant popularity in recent years. Therefore, this paper aims +to merge the local training technique with the adaptive approach to develop +efficient distributed learning methods. We consider the classical Local SGD +method and enhance it with a scaling feature. A crucial aspect is that the +scaling is described generically, allowing us to analyze various approaches, +including Adam, RMSProp, and OASIS, in a unified manner. In addition to +theoretical analysis, we validate the performance of our methods in practice by +training a neural network. + +
+
+ comment: 41 pages, 2 algorithms, 6 figures, 1 table +
+
+
+
+
+ + ♻ ☆ DFDG: Data-Free Dual-Generator Adversarial Distillation for One-Shot + Federated Learning + + +
+ Federated Learning (FL) is a distributed machine learning scheme in which +clients jointly participate in the collaborative training of a global model by +sharing model information rather than their private datasets. In light of +concerns associated with communication and privacy, one-shot FL with a single +communication round has emerged as a de facto promising solution. However, +existing one-shot FL methods either require public datasets, focus on model +homogeneous settings, or distill limited knowledge from local models, making it +difficult or even impractical to train a robust global model. To address these +limitations, we propose a new data-free dual-generator adversarial distillation +method (namely DFDG) for one-shot FL, which can explore a broader local models' +training space via training dual generators. DFDG is executed in an adversarial +manner and comprises two parts: dual-generator training and dual-model +distillation. In dual-generator training, we delve into each generator +concerning fidelity, transferability and diversity to ensure its utility, and +additionally tailor the cross-divergence loss to lessen the overlap of dual +generators' output spaces. In dual-model distillation, the trained dual +generators work together to provide the training data for updates of the global +model. At last, our extensive experiments on various image classification tasks +show that DFDG achieves significant performance gains in accuracy compared to +SOTA baselines. + +
+
+ comment: Accepted by ICDM2024 main conference (long paper). arXiv admin note: + substantial text overlap with arXiv:2309.13546 +
+
+
+
+
+ + ♻ ☆ SABLE: Staging Blocked Evaluation of Sparse Matrix Computations + + +
+ Sparse Matrices found in the real world often have some structure in their +distribution of dense elements. While existing techniques specialize the +generated code for the structure of matrices, their generality misses +optimization opportunities. We propose a system that -- if the sparse matrix is +stored in a blocked storage format -- can adapt its code generation strategy +depending on the structure of the sparse matrix. Our system SABLE performs a +specified computation over every element of {\em mostly} dense blocks instead +of avoiding computing any sparse element and achieving regularity in generated +code while having special treatment for hyper-sparse blocks (i.e., blocks with +very few dense elements). SABLE is extensible, providing a block iterator for +users to express any computation over these non-empty blocks. We demonstrate +that our approach can significantly accelerate SpMV and SpMM operations, +surpassing the performance of state-of-the-art systems like Partially-Strided +Codelets and Sparse Register Tiling. + +
+
+
+
+
+
+
+
+ + Programming and Languages 2 + +
+
+
+ + ☆ High-level quantum algorithm programming using Silq + + +
+ Quantum computing, with its vast potential, is fundamentally shaped by the +intricacies of quantum mechanics, which both empower and constrain its +capabilities. The development of a universal, robust quantum programming +language has emerged as a key research focus in this rapidly evolving field. +This paper explores Silq, a recent high-level quantum programming language, +highlighting its strengths and unique features. We aim to share our insights on +designing and implementing high-level quantum algorithms using Silq, +demonstrating its practical applications and advantages for quantum +programming. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Coordination-free Collaborative Replication based on Operational + Transformation + + +
+ We introduce Coordination-free Collaborative Replication (CCR), a new method +for maintaining consistency across replicas in distributed systems without +requiring explicit coordination messages. CCR automates conflict resolution, +contrasting with traditional Data-sharing systems that typically involve +centralized update management or predefined consistency rules. + Operational Transformation (OT), commonly used in collaborative editing, +ensures consistency by transforming operations while maintaining document +integrity across replicas. However, OT assumes server-based coordination, which +is unsuitable for modern, decentralized Peer-to-Peer (P2P) systems. + Conflict-free Replicated Data Type (CRDT), like Two-Phase Sets (2P-Sets), +guarantees eventual consistency by allowing commutative and associative +operations but often results in counterintuitive behaviors, such as failing to +re-add an item to a shopping cart once removed. + In contrast, CCR employs a more intuitive approach to replication. It allows +for straightforward updates and conflict resolution based on the current data +state, enhancing clarity and usability compared to CRDTs. Furthermore, CCR +addresses inefficiencies in messaging by developing a versatile protocol based +on data stream confluence, thus providing a more efficient and practical +solution for collaborative data sharing in distributed systems. + +
+
+
+
+
+
+
+
+ + Computational Complexity 6 + +
+
+
+ + ☆ New Direct Sum Tests + + +
+ A function $f:[n]^{d} \to \mathbb{F}_2$ is a \defn{direct sum} if there are +functions $L_i:[n]\to \mathbb{F}_2$ such that ${f(x) = \sum_{i}L_i(x_i)}$. In +this work we give multiple results related to the property testing of direct +sums. + Our first result concerns a test proposed by Dinur and Golubev in 2019. We +call their test the Diamond test and show that it is indeed a direct sum +tester. More specifically, we show that if a function $f$ is $\epsilon$-far +from being a direct sum function, then the Diamond test rejects $f$ with +probability at least $\Omega_{n,\epsilon}(1)$. Even in the case of $n = 2$, the +Diamond test is, to the best of our knowledge, novel and yields a new tester +for the classic property of affinity. + Apart from the Diamond test, we also analyze a broad family of direct sum +tests, which at a high level, run an arbitrary affinity test on the restriction +of $f$ to a random hypercube inside of $[n]^d$. This family of tests includes +the direct sum test analyzed in \cite{di19}, but does not include the Diamond +test. As an application of our result, we obtain a direct sum test which works +in the online adversary model of \cite{KRV}. + Finally, we also discuss a Fourier analytic interpretation of the diamond +tester in the $n=2$ case, as well as prove local correction results for direct +sum as conjectured by Dinur and Golubev. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Efficient approximation schemes for scheduling on a stochastic number of + machines + + +
+ We study three two-stage optimization problems with a similar structure and +different objectives. In the first stage of each problem, the goal is to assign +input jobs of positive sizes to unsplittable bags. After this assignment is +decided, the realization of the number of identical machines that will be +available is revealed. Then, in the second stage, the bags are assigned to +machines. The probability vector of the number of machines in the second stage +is known to the algorithm as part of the input before making the decisions of +the first stage. Thus, the vector of machine completion times is a random +variable. The goal of the first problem is to minimize the expected value of +the makespan of the second stage schedule, while the goal of the second problem +is to maximize the expected value of the minimum completion time of the +machines in the second stage solution. The goal of the third problem is to +minimize the $\ell_p$ norm for a fixed $p>1$, where the norm is applied on +machines' completion times vectors. Each one of the first two problems admits a +PTAS as Buchem et al. showed recently. Here we significantly improve all their +results by designing an EPTAS for each one of these problems. We also design an +EPTAS for $\ell_p$ norm minimization for any $p>1$. + +
+
+
+
+
+ + ♻ ☆ On the Complexity of Minimizing Energy Consumption of Partitioning DAG + Tasks + + +
+ We study a graph partition problem where we are given a directed acyclic +graph (DAG) whose vertices and arcs can be respectively regarded as tasks and +dependencies among tasks. The objective of the problem is to minimize the total +energy consumed for completing these tasks by assigning the tasks to k +heterogeneous machines. We first show that the problem is NP-hard. Then, we +present polynomial-time algorithms for two special cases where there are only +two machines and where the input DAG is a directed path. Finally, we study a +natural variant where there are only two machines with one of them being +capable of executing a limited number of tasks. We show that this special case +remains computationally hard. + +
+
+
+
+
+ + ♻ ☆ A Reply to "On Salum's Algorithm for X3SAT" + + +
+ This paper is a reply to "On Salum's Algorithm for X3SAT" (arXiv:2104.02886) + +
+
+
+
+
+ + ♻ ☆ On $NP$ versus ${\rm co}NP$ and Frege Systems + + +
+ We prove in this paper that there is a language $L_d$ accepted by some +nondeterministic Turing machines but not by any ${\rm co}\mathcal{NP}$-machines +(defined later). Then we further show that $L_d$ is in $\mathcal{NP}$, thus +proving that $\mathcal{NP}\neq{\rm co}\mathcal{NP}$. The techniques used in +this paper are lazy-diagonalization and the novel technique developed in +the author's recent work \cite{Lin21}. As a by-product, we reach the important +result that $\mathcal{P}\neq\mathcal{NP}$ \cite{Lin21} once again, which is +clear from the above outcome and the well-known fact that $\mathcal{P}={\rm +co}\mathcal{P}$. Next, we show that the complexity class ${\rm co}\mathcal{NP}$ +has intermediate languages, i.e., there are languages $L_{inter}\in{\rm +co}\mathcal{NP}$ which are not in $\mathcal{P}$ and not ${\rm +co}\mathcal{NP}$-complete. We also summarize other direct consequences such as +$\mathcal{NEXP}\neq{\rm co}\mathcal{NEXP}$ and others in the area of +proof complexity implied by our main outcome. Lastly, we show a lower bounds +result for Frege proof systems, i.e., no Frege proof systems can be polynomially +bounded. + +
+
+ comment: [v4] 30 pages; further improved; arXiv admin note: text overlap with + arXiv:2110.06211 +
+
+
+
+
+ + ♻ ☆ Semidefinite programming and linear equations vs. homomorphism problems + + +
+ We introduce a relaxation for homomorphism problems that combines +semidefinite programming with linear Diophantine equations, and propose a +framework for the analysis of its power based on the spectral theory of +association schemes. We use this framework to establish an unconditional lower +bound against the semidefinite programming + linear equations model, by showing +that the relaxation does not solve the approximate graph homomorphism problem +and thus, in particular, the approximate graph colouring problem. + +
+
+
+
+
+
+
+
+ + Logic in Computer Science 4 + +
+
+
+ + ☆ Directed equality with dinaturality + + +
+ We show how dinaturality plays a central role in the interpretation of +directed type theory where types are interpreted as (1-)categories and directed +equality is represented by $\hom$-functors. We present a general elimination +principle based on dinaturality for directed equality which very closely +resembles the $J$-rule used in Martin-L\"of type theory, and we highlight which +syntactical restrictions are needed to interpret this rule in the context of +directed equality. We then use these rules to characterize directed equality as +a left relative adjoint to a functor between (para)categories of dinatural +transformations which contracts together two variables appearing naturally with +a single dinatural one, with the relative functor imposing the syntactic +restrictions needed. We then argue that the quantifiers of such a directed type +theory should be interpreted as ends and coends, which dinaturality allows us +to present in adjoint-like correspondences to a weakening functor. Using these +rules we give a formal interpretation to Yoneda reductions and (co)end +calculus, and we use logical derivations to prove the Fubini rule for +quantifier exchange, the adjointness property of Kan extensions via (co)ends, +exponential objects of presheaves, and the (co)Yoneda lemma. We show +transitivity (composition), congruence (functoriality), and transport +(coYoneda) for directed equality by closely following the same approach of +Martin-L\"of type theory, with the notable exception of symmetry. We formalize +our main theorems in Agda. + +
+
+
+
+
+ + ☆ Minimal Model Counting via Knowledge Compilation + + +
+ Counting the number of models of a Boolean formula is a fundamental problem +in artificial intelligence and reasoning. Minimal models of a Boolean formula +are critical in various reasoning systems, making the counting of minimal +models essential for detailed inference tasks. Existing research primarily +focused on decision problems related to minimal models. In this work, we extend +beyond decision problems to address the challenge of counting minimal models. +Specifically, we propose a novel knowledge compilation form that facilitates +the efficient counting of minimal models. Our approach leverages the idea of +justification and incorporates theories from answer set counting. + +
+
+
+
+
+ + ♻ ☆ Decidability of Querying First-Order Theories via Countermodels of + Finite Width + + +
+ We propose a generic framework for establishing the decidability of a wide +range of logical entailment problems (briefly called querying), based on the +existence of countermodels that are structurally simple, gauged by certain +types of width measures (with treewidth and cliquewidth as popular examples). +As an important special case of our framework, we identify logics exhibiting +width-finite finitely universal model sets, warranting decidable entailment for +a wide range of homomorphism-closed queries, subsuming a diverse set of +practically relevant query languages. As a particularly powerful width measure, +we propose to employ Blumensath's partitionwidth, which subsumes various other +commonly considered width measures and exhibits highly favorable computational +and structural properties. Focusing on the formalism of existential rules as a +popular showcase, we explain how finite partitionwidth sets of rules subsume +other known abstract decidable classes but - leveraging existing notions of +stratification - also cover a wide range of new rulesets. We expose natural +limitations for fitting the class of finite unification sets into our picture +and suggest several options for remedy. + +
+
+
+
+
+ + ♻ ☆ Extensional Taylor Expansion + + +
+ We introduce a calculus of extensional resource terms. These are resource +terms \`a la Ehrhard-Regnier, but in infinitely eta-long form. The calculus +still retains a finite syntax and dynamics: in particular, we prove strong +confluence and normalization. + Then we define an extensional version of Taylor expansion, mapping ordinary +lambda-terms to (possibly infinite) linear combinations of extensional resource +terms: like in the ordinary case, the dynamics of our resource calculus allows +us to simulate the beta-reduction of lambda-terms; the extensional nature of +this expansion shows in the fact that we are also able to simulate +eta-reduction. + In a sense, extensional resource terms contain a language of finite +approximants of Nakajima trees, much like ordinary resource terms can be seen +as a richer version of finite B\"ohm trees. We show that the equivalence +induced on lambda-terms by the normalization of extensional Taylor-expansion is +nothing but H*, the greatest consistent sensible lambda-theory - which is also +the theory induced by Nakajima trees. This characterization provides a new, +simple way to exhibit models of H*: it becomes sufficient to model the +extensional resource calculus and its dynamics. + The extensional resource calculus moreover allows us to recover, in an +untyped setting, a connection between Taylor expansion and game semantics that +was previously limited to the typed setting. Indeed, simply typed, eta-long, +beta-normal resource terms are known to be in bijective correspondence with +plays in the sense of Hyland-Ong game semantics, up to Melli\`es' homotopy +equivalence. Extensional resource terms are the appropriate counterpart of +eta-long resource terms in an untyped setting: we spell out the bijection +between normal extensional resource terms and isomorphism classes of +augmentations (a canonical presentation of plays up to homotopy) in the +universal arena. + +
+
+
+
+
+
+
+
+ + Performance Profiling 1 + +
+
+
+ + ♻ ☆ SABLE: Staging Blocked Evaluation of Sparse Matrix Computations + + +
+ Sparse Matrices found in the real world often have some structure in their +distribution of dense elements. While existing techniques specialize the +generated code for the structure of matrices, their generality misses +optimization opportunities. We propose a system that -- if the sparse matrix is +stored in a blocked storage format -- can adapt its code generation strategy +depending on the structure of the sparse matrix. Our system SABLE performs a +specified computation over every element of {\em mostly} dense blocks instead +of avoiding computing any sparse element and achieving regularity in generated +code while having special treatment for hyper-sparse blocks (ie, blocks with +very few dense elements). SABLE is extensible, providing a block iterator for +users to express any computation over these non-empty blocks. We demonstrate +that our approach can significantly accelerate SpMV and SpMM operations, +surpassing the performance of state-of-the-art systems like Partially-Strided +Codelets and Sparse Register Tiling. + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ♻ ☆ Efficient Analysis of Unambiguous Automata Using Matrix Semigroup + Techniques + + +
+ We introduce a novel technique to analyse unambiguous B\"uchi automata +quantitatively, and apply this to the model checking problem. It is based on +linear-algebra arguments that originate from the analysis of matrix semigroups +with constant spectral radius. This method can replace a combinatorial +procedure that dominates the computational complexity of the existing procedure +by Baier et al. We analyse the complexity in detail, showing that, in terms of +the set $Q$ of states of the automaton, the new algorithm runs in time +$O(|Q|^4)$, improving on an efficient implementation of the combinatorial +algorithm by a factor of $|Q|$. + +
+
+ comment: Technical report for an MFCS'19 paper. This version fixes a bug in + Appendix A +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computational Complexity 4 + +
+
+
+ + ☆ Complexity and algorithms for Swap median and relation to other + consensus problems + + +
+ Genome rearrangements are events in which large blocks of DNA exchange pieces +during evolution. The analysis of such events is a tool for understanding +evolutionary genomics, based on finding the minimum number of rearrangements to +transform one genome into another. In a general scenario, more than two genomes +are considered and we have new challenges. The {\sc Median} problem consists in +finding, given three permutations and a distance metric, a permutation $s$ that +minimizes the sum of the distances between $s$ and each input. We study the +{\sc median} problem over \emph{swap} distances in permutations, for which the +computational complexity has been open for almost 20 years (Eriksen, +\emph{Theor. Compt. Sci.}, 2007). We consider this problem through some +branches. We associate median solutions and interval convex sets, where the +concept of graph convexity inspires the following investigation: Does a median +permutation belong to every shortest path between one of the pairs of input +permutations? We are able to partially answer this question, and as a +by-product we solve a long open problem by proving that the {\sc Swap Median} +problem is NP-hard. Furthermore, using a similar approach, we show that the +{\sc Closest} problem, which seeks to minimize the maximum distance between the +solution and the input permutations, is NP-hard even considering three input +permutations. This gives a sharp dichotomy into the P vs. NP-hard approaches, +since considering two input permutations the problem is easily solvable and +considering any number of input permutations it is known to be NP-hard since +2007 (Popov, \emph{Theor. Compt. Sci.}, 2007). In addition, we show that {\sc +Swap Median} and {\sc Swap Closest} are APX-hard problems. + +
+
+
+
+
+ + ♻ ☆ Solving promise equations over monoids and groups + + +
+ We give a complete complexity classification for the problem of finding a +solution to a given system of equations over a fixed finite monoid, given that +a solution over a more restricted monoid exists. As a corollary, we obtain a +complexity classification for the same problem over groups. + +
+
+ comment: Full version of an ICALP 2024 paper +
+
+
+
+
+ + ♻ ☆ The Stochastic Arrival Problem + + +
+ We study a new modification of the Arrival problem, which allows for nodes
+that exhibit random as well as controlled behaviour, in addition to switching
+nodes. We study the computational complexity of these extensions, building on
+existing work on Reachability Switching Games. In particular, we show for
+versions of the arrival problem involving just switching and random nodes it is
+PP-hard to decide if their value is greater than a half and we give a PSPACE
+decision algorithm.
+
+
+
+
+
+ + ♻ ☆ The n-vehicle exploration problem is NP-complete + + +
+ The $n$-vehicle exploration problem (NVEP) is a nonlinear unconstrained +optimization problem. Given a fleet of $n$ vehicles with mid-trip refueling +technique, the NVEP tries to find a sequence of $n$ vehicles to make one of the +vehicles travel the farthest, and at last all the vehicles return to the start +point. NVEP has a fractional form of objective function, and its computational +complexity of general case remains open. Given a directed graph $G$, it can be +reduced in polynomial time to an instance of NVEP. We prove that the graph $G$ +has a hamiltonian path if and only if the reduced NVEP instance has a feasible +sequence of length at least $n$. Therefore we show that Hamiltonian path +$\leq_P$ NVEP, and consequently prove that NVEP is NP-complete. + +
+
+ comment: 5 pages, no figure +
+
+
+
+
+
+
+
+ + Logic in Computer Science 3 + +
+
+
+ + ♻ ☆ Solving promise equations over monoids and groups + + +
+ We give a complete complexity classification for the problem of finding a +solution to a given system of equations over a fixed finite monoid, given that +a solution over a more restricted monoid exists. As a corollary, we obtain a +complexity classification for the same problem over groups. + +
+
+ comment: Full version of an ICALP 2024 paper +
+
+
+
+
+ + ♻ ☆ Specifying a Game-Theoretic Extensive Form as an Abstract 5-ary Relation + + +
+ This paper specifies an extensive form as a 5-ary relation (that is, as a set +of quintuples) which satisfies eight abstract axioms. Each quintuple is +understood to list a player, a situation (that is, a name for an information +set), a decision node, an action, and a successor node. Accordingly, the axioms +are understood to specify abstract relationships between players, situations, +nodes, and actions. Such an extensive form is called a "pentaform". Finally, a +"pentaform game" is defined to be a pentaform together with utility functions. + To ground this new specification in the literature, the paper defines the +concept of a "traditional game" to represent the literature's many +specifications of finite-horizon and infinite-horizon games. The paper's main +result is to construct an intuitive bijection between pentaform games and +traditional games. Secondary results concern disaggregating pentaforms by +subsets, constructing pentaforms by unions, and initial pentaform applications +to Selten subgames and perfect-recall (an extensive application to dynamic +programming is in Streufert 2023, arXiv:2302.03855). + +
+
+ comment: 53 pages, 9 figures. This version 6 makes small editorial changes. + Version 5 had essentially the same results as Version 4, but with improved + exposition and appendices. Version 4 merely updated cross-references, while + Version 3 was extensively rewritten with new tools and applications. Version + 1 is Western University Department of Economics Research Report 2021-3 +
+
+
+
+
+ + ♻ ☆ From Width-Based Model Checking to Width-Based Automated Theorem Proving + + +
+ In the field of parameterized complexity theory, the study of graph width +measures has been intimately connected with the development of width-based +model checking algorithms for combinatorial properties on graphs. In this work, +we introduce a general framework to convert a large class of width-based +model-checking algorithms into algorithms that can be used to test the validity +of graph-theoretic conjectures on classes of graphs of bounded width. Our +framework is modular and can be applied with respect to several well-studied +width measures for graphs, including treewidth and cliquewidth. + As a quantitative application of our framework, we prove analytically that +for several long-standing graph-theoretic conjectures, there exists an +algorithm that takes a number $k$ as input and correctly determines in time +double-exponential in $k^{O(1)}$ whether the conjecture is valid on all graphs +of treewidth at most $k$. These upper bounds, which may be regarded as +upper-bounds on the size of proofs/disproofs for these conjectures on the class +of graphs of treewidth at most $k$, improve significantly on theoretical upper +bounds obtained using previously available techniques. + +
+
+ comment: A preliminary version of this work was published in the proceedings + of AAAI 2023 +
+
+
+
+
+
+
+
+
+ Hardware Architecture 4
+
+
+
+
+ + ☆ CAT: Customized Transformer Accelerator Framework on Versal ACAP + + +
+ Transformer uses GPU as the initial design platform, but GPU can only perform
+limited hardware customization. Although FPGA has strong customization ability,
+the design solution space is huge and the design difficulty is high. Versal
+ACAP is a heterogeneous computing architecture with AI Engine as the core. It
+is far more flexible than GPU in hardware customization, and has better and
+smaller design solution space than traditional FPGA. Therefore, this paper
+proposes the Customized Transformer Accelerator Framework (CAT). Through the
+CAT framework, a customized Transformer accelerator family can be derived on
+Versal ACAP. The CAT framework has an abstract accelerator architecture design
+idea, which deconstructs and efficiently maps the Transformer into the
+hardware, and which contains a variety of customizable properties. Through the
+customization and optimization strategy of the CAT framework, the underlying
+hardware and the upper model jointly constrain and decide on these customizable
+properties, and finally form a customized accelerator. We use a 7 nm AMD Versal
+ACAP VCK5000 development board to implement accelerators for different
+Transformer models based on the CAT framework. Experiments show that we achieve
+the highest throughput gains of 2.41x, 49.50x, and 1.32x compared to 8 nm
+Nvidia GPU A10G, 16 nm AMD FPGA ZCU102, and 7 nm AMD Versal ACAP VC190 (SOTA).
+The highest energy efficiency gains are 7.80x, 6.19x and 1.15x, respectively.
+
+
+
+
+
+ + ☆ Pack my weights and run! Minimizing overheads for in-memory computing + accelerators + + +
+ In-memory computing hardware accelerators allow more than 10x improvements in +peak efficiency and performance for matrix-vector multiplications (MVM) +compared to conventional digital designs. For this, they have gained great +interest for the acceleration of neural network workloads. Nevertheless, these +potential gains are only achieved when the utilization of the computational +resources is maximized and the overhead from loading operands in the memory +array minimized. To this aim, this paper proposes a novel mapping algorithm for +the weights in the IMC macro, based on efficient packing of the weights of +network layers in the available memory. The algorithm realizes 1) minimization +of weight loading times while at the same time 2) maximally exploiting the +parallelism of the IMC computational fabric. A set of case studies are carried +out to show achievable trade-offs for the MLPerf Tiny benchmark +\cite{mlperftiny} on IMC architectures, with potential $10-100\times$ EDP +improvements. + +
+
+ comment: 7 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Time-Series Forecasting and Sequence Learning Using Memristor-based + Reservoir System + + +
+ Pushing the frontiers of time-series information processing in the +ever-growing domain of edge devices with stringent resources has been impeded +by the systems' ability to process information and learn locally on the device. +Local processing and learning of time-series information typically demand +intensive computations and massive storage as the process involves retrieving +information and tuning hundreds of parameters back in time. In this work, we +developed a memristor-based echo state network accelerator that features +efficient temporal data processing and in-situ online learning. The proposed +design is benchmarked using various datasets involving real-world tasks, such +as forecasting the load energy consumption and weather conditions. The +experimental results illustrate that the hardware model experiences a marginal +degradation in performance as compared to the software counterpart. This is +mainly attributed to the limited precision and dynamic range of network +parameters when emulated using memristor devices. The proposed system is +evaluated for lifespan, robustness, and energy-delay product. It is observed +that the system demonstrates reasonable robustness for device failure below +10%, which may occur due to stuck-at faults. Furthermore, 247X reduction in +energy consumption is achieved when compared to a custom CMOS digital design +implemented at the same technology node. + +
+
+
+
+
+ + ♻ ☆ PolyLUT-Add: FPGA-based LUT Inference with Wide Inputs + + +
+ FPGAs have distinct advantages as a technology for deploying deep neural +networks (DNNs) at the edge. Lookup Table (LUT) based networks, where neurons +are directly modeled using LUTs, help maximize this promise of offering +ultra-low latency and high area efficiency on FPGAs. Unfortunately, LUT +resource usage scales exponentially with the number of inputs to the LUT, +restricting PolyLUT to small LUT sizes. This work introduces PolyLUT-Add, a +technique that enhances neuron connectivity by combining $A$ PolyLUT +sub-neurons via addition to improve accuracy. Moreover, we describe a novel +architecture to improve its scalability. We evaluated our implementation over +the MNIST, Jet Substructure classification, and Network Intrusion Detection +benchmark and found that for similar accuracy, PolyLUT-Add achieves a LUT +reduction of $2.0-13.9\times$ with a $1.2-1.6\times$ decrease in latency. + +
+
+ comment: The source code for this paper is available at: + https://github.com/bingleilou/PolyLUT-Add +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 5 + +
+
+
+ + ☆ Leiden-Fusion Partitioning Method for Effective Distributed Training of + Graph Embeddings + + +
+ In the area of large-scale training of graph embeddings, effective training +frameworks and partitioning methods are critical for handling large networks. +However, they face two major challenges: 1) existing synchronized distributed +frameworks require continuous communication to access information from other +machines, and 2) the inability of current partitioning methods to ensure that +subgraphs remain connected components without isolated nodes, which is +essential for effective training of GNNs since training relies on information +aggregation from neighboring nodes. To address these issues, we introduce a +novel partitioning method, named Leiden-Fusion, designed for large-scale +training of graphs with minimal communication. Our method extends the Leiden +community detection algorithm with a greedy algorithm that merges the smallest +communities with highly connected neighboring communities. Our method +guarantees that, for an initially connected graph, each partition is a densely +connected subgraph with no isolated nodes. After obtaining the partitions, we +train a GNN for each partition independently, and finally integrate all +embeddings for node classification tasks, which significantly reduces the need +for network communication and enhances the efficiency of distributed graph +training. We demonstrate the effectiveness of our method through extensive +evaluations on several benchmark datasets, achieving high efficiency while +preserving the quality of the graph embeddings for node classification tasks. + +
+
+ comment: Accepted at the 2024 European Conference on Machine Learning and + Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD 2024) +
+
+
+
+
+ + ☆ The Landscape of GPU-Centric Communication + + +
+ In recent years, GPUs have become the preferred accelerators for HPC and ML
+applications due to their parallelism and fast memory bandwidth. While GPUs
+boost computation, inter-GPU communication can create scalability bottlenecks,
+especially as the number of GPUs per node and cluster grows. Traditionally, the
+CPU managed multi-GPU communication, but advancements in GPU-centric
+communication now challenge this CPU dominance by reducing its involvement,
+granting GPUs more autonomy in communication tasks, and addressing mismatches
+in multi-GPU communication and computation.
+ This paper provides a landscape of GPU-centric communication, focusing on
+vendor mechanisms and user-level library supports. It aims to clarify the
+complexities and diverse options in this field, define the terminology, and
+categorize existing approaches within and across nodes. The paper discusses
+vendor-provided mechanisms for communication and memory management in multi-GPU
+execution and reviews major communication libraries, their benefits,
+challenges, and performance insights. Then, it explores key research paradigms,
+future outlooks, and open research questions. By extensively describing
+GPU-centric communication techniques across the software and hardware stacks,
+we provide researchers, programmers, engineers, and library designers insights
+on how to exploit multi-GPU systems at their best.
+
+
+
+
+
+ + ☆ Federated Learning in Adversarial Environments: Testbed Design and + Poisoning Resilience in Cybersecurity + + +
+ This paper presents the design and implementation of a Federated Learning +(FL) testbed, focusing on its application in cybersecurity and evaluating its +resilience against poisoning attacks. Federated Learning allows multiple +clients to collaboratively train a global model while keeping their data +decentralized, addressing critical needs for data privacy and security, +particularly in sensitive fields like cybersecurity. Our testbed, built using +the Flower framework, facilitates experimentation with various FL frameworks, +assessing their performance, scalability, and ease of integration. Through a +case study on federated intrusion detection systems, we demonstrate the +testbed's capabilities in detecting anomalies and securing critical +infrastructure without exposing sensitive network data. Comprehensive poisoning +tests, targeting both model and data integrity, evaluate the system's +robustness under adversarial conditions. Our results show that while federated +learning enhances data privacy and distributed learning, it remains vulnerable +to poisoning attacks, which must be mitigated to ensure its reliability in +real-world applications. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ ACC Saturator: Automatic Kernel Optimization for Directive-Based GPU + Code + + +
+ Automatic code optimization is a complex process that typically involves the +application of multiple discrete algorithms that modify the program structure +irreversibly. However, the design of these algorithms is often monolithic, and +they require repetitive implementation to perform similar analyses due to the +lack of cooperation. To address this issue, modern optimization techniques, +such as equality saturation, allow for exhaustive term rewriting at various +levels of inputs, thereby simplifying compiler design. + In this paper, we propose equality saturation to optimize sequential codes +utilized in directive-based programming for GPUs. Our approach realizes less +computation, less memory access, and high memory throughput simultaneously. Our +fully-automated framework constructs single-assignment forms from inputs to be +entirely rewritten while keeping dependencies and extracts optimal cases. +Through practical benchmarks, we demonstrate a significant performance +improvement on several compilers. Furthermore, we highlight the advantages of +computational reordering and emphasize the significance of memory-access order +for modern GPUs. + +
+
+ comment: To appear in: Proceedings of Eleventh Workshop on Accelerator + Programming and Directives (WACCPD 2024) +
+
+
+
+
+ + ♻ ☆ Aegis: A Decentralized Expansion Blockchain + + +
+ Blockchains implement monetary systems operated by committees of nodes. The +robustness of established blockchains presents an opportunity to leverage their +infrastructure for creating expansion chains. Expansion chains can provide +additional functionality to the primary chain they leverage or implement +separate functionalities, while benefiting from the primary chain's security +and the stability of its tokens. Indeed, tools like Ethereum's EigenLayer +enable nodes to stake (deposit collateral) on a primary chain to form a +committee responsible for operating an expansion chain. + But here is the rub. Classical protocols assume correct, well-behaved nodes +stay correct indefinitely. Yet in our case, the stake incentivizes +correctness--it will be slashed (revoked) if its owner deviates. Once a node +withdraws its stake, there is no basis to assume its correctness. + To address the new challenge, we present Aegis, an expansion chain based on +primary-chain stake, assuming a bounded primary-chain write time. Aegis uses +references from Aegis blocks to primary blocks to define committees, +checkpoints on the primary chain to perpetuate decisions, and resets on the +primary chain to establish a new committee if the previous one becomes +obsolete. It ensures safety at all times and rapid progress when latency among +Aegis nodes is low. + +
+
+
+
+
+
+
+
+ + Programming and Languages 1 + +
+
+
+ + ♻ ☆ Modal Abstractions for Virtualizing Memory Addresses + + +
+ Operating system kernels employ virtual memory subsystems, which use a CPU's
+memory management units (MMUs) to virtualize the addresses of memory regions.
+Operating systems manipulate these virtualized memory mappings to isolate
+untrusted processes, restrict which memory is accessible to different
+processes, hide memory limits from user programs, ensure process isolation,
+and implement demand-paging and copy-on-write behaviors for performance and
+resource controls.
+ Virtual memory management (VMM) code is a critical piece of general-purpose
+OS kernels, but verification of this functionality is challenging due to the
+complexity of the hardware interface. In this paper, we introduce a modal
+abstraction to describe the truth of assertions relative to a specific virtual
+address space: [r]P indicating that P holds in the virtual address space rooted
+at r. Such modal assertions allow different address spaces to refer to each
+other, enabling complete verification of instruction sequences manipulating
+multiple address spaces. Using them effectively requires working with other
+assertions, such as points-to assertions in our separation logic, as relative
+to a given address space. We therefore define virtual points-to relations,
+which mimic hardware address translation, relative to a page table root. We
+demonstrate our approach with challenging fragments of VMM code showing that
+our approach handles examples beyond what prior work can address, including
+reasoning about a sequence of instructions as it changes address spaces. All
+definitions and theorems mentioned in this paper including the operational
+model of a RISC-like fragment of x86-64, a simple language run on this
+operational model, and a logic as an instantiation of the Iris framework are
+mechanized inside Coq.
+
+
+
+
+
+
+
+
+ + Performance Profiling 2 + +
+
+
+ + ☆ The Landscape of GPU-Centric Communication + + +
+ In recent years, GPUs have become the preferred accelerators for HPC and ML
+applications due to their parallelism and fast memory bandwidth. While GPUs
+boost computation, inter-GPU communication can create scalability bottlenecks,
+especially as the number of GPUs per node and cluster grows. Traditionally, the
+CPU managed multi-GPU communication, but advancements in GPU-centric
+communication now challenge this CPU dominance by reducing its involvement,
+granting GPUs more autonomy in communication tasks, and addressing mismatches
+in multi-GPU communication and computation.
+ This paper provides a landscape of GPU-centric communication, focusing on
+vendor mechanisms and user-level library supports. It aims to clarify the
+complexities and diverse options in this field, define the terminology, and
+categorize existing approaches within and across nodes. The paper discusses
+vendor-provided mechanisms for communication and memory management in multi-GPU
+execution and reviews major communication libraries, their benefits,
+challenges, and performance insights. Then, it explores key research paradigms,
+future outlooks, and open research questions. By extensively describing
+GPU-centric communication techniques across the software and hardware stacks,
+we provide researchers, programmers, engineers, and library designers insights
+on how to exploit multi-GPU systems at their best.
+
+
+
+
+
+ + ☆ A Global Perspective on the Past, Present, and Future of Video Streaming + over Starlink + + +
+ This study presents the first global analysis of on-demand video streaming +over Low Earth Orbit (LEO) satellite networks, using data from over one million +households across 85 countries. We highlight Starlink's role as a major LEO +provider, enhancing connectivity in underserved regions. Our findings reveal +that while overall video quality on Starlink matches that of traditional +networks, the inherent variability in LEO conditions -- such as throughput +fluctuations and packet loss -- leads to an increase in bitrate switches and +rebuffers. To further improve the quality of experience for the LEO community, +we manipulate existing congestion control and adaptive bitrate streaming +algorithms using simulation and real A/B tests deployed on over one million +households. Our results underscore the need for video streaming and congestion +control algorithms to adapt to rapidly evolving network landscapes, ensuring +high-quality service across diverse and dynamic network types. + +
+
+
+
+
+
+
+
+
+ Operating Systems 1
+
+
+
+
+ + ☆ BULKHEAD: Secure, Scalable, and Efficient Kernel Compartmentalization + with PKS NDSS'25 + + +
+ The endless stream of vulnerabilities urgently calls for principled +mitigation to confine the effect of exploitation. However, the monolithic +architecture of commodity OS kernels, like the Linux kernel, allows an attacker +to compromise the entire system by exploiting a vulnerability in any kernel +component. Kernel compartmentalization is a promising approach that follows the +least-privilege principle. However, existing mechanisms struggle with the +trade-off on security, scalability, and performance, given the challenges +stemming from mutual untrustworthiness among numerous and complex components. + In this paper, we present BULKHEAD, a secure, scalable, and efficient kernel +compartmentalization technique that offers bi-directional isolation for +unlimited compartments. It leverages Intel's new hardware feature PKS to +isolate data and code into mutually untrusted compartments and benefits from +its fast compartment switching. With untrust in mind, BULKHEAD introduces a +lightweight in-kernel monitor that enforces multiple important security +invariants, including data integrity, execute-only memory, and compartment +interface integrity. In addition, it provides a locality-aware two-level scheme +that scales to unlimited compartments. We implement a prototype system on Linux +v6.1 to compartmentalize loadable kernel modules (LKMs). Extensive evaluation +confirms the effectiveness of our approach. As the system-wide impacts, +BULKHEAD incurs an average performance overhead of 2.44% for real-world +applications with 160 compartmentalized LKMs. While focusing on a specific +compartment, ApacheBench tests on ipv6 show an overhead of less than 2%. +Moreover, the performance is almost unaffected by the number of compartments, +which makes it highly scalable. + +
+
+ comment: Accepted to appear in NDSS'25 +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 3 + +
+
+
+ + ☆ Well-Behaved (Co)algebraic Semantics of Regular Expressions in Dafny + + +
+ Regular expressions are commonly understood in terms of their denotational +semantics, that is, through formal languages -- the regular languages. This +view is inductive in nature: two primitives are equivalent if they are +constructed in the same way. Alternatively, regular expressions can be +understood in terms of their operational semantics, that is, through +deterministic finite automata. This view is coinductive in nature: two +primitives are equivalent if they are deconstructed in the same way. It is +implied by Kleene's famous theorem that both views are equivalent: regular +languages are precisely the formal languages accepted by deterministic finite +automata. In this paper, we use Dafny, a verification-aware programming +language, to formally verify, for the first time, what has been previously +established only through proofs-by-hand: the two semantics of regular +expressions are well-behaved, in the sense that they are in fact one and the +same, up to pointwise bisimilarity. At each step of our formalisation, we +propose an interpretation in the language of Coalgebra. We found that Dafny is +particularly well suited for the task due to its inductive and coinductive +features and hope our approach serves as a blueprint for future generalisations +to other theories. + +
+
+
+
+
+ + ☆ Risk-Aware Autonomous Driving for Linear Temporal Logic Specifications + + +
+ Decision-making for autonomous driving incorporating different types of risks +is a challenging topic. This paper proposes a novel risk metric to facilitate +the driving task specified by linear temporal logic (LTL) by balancing the risk +brought up by different uncertain events. Such a balance is achieved by +discounting the costs of these uncertain events according to their timing and +severity, thereby reflecting a human-like awareness of risk. We have +established a connection between this risk metric and the occupation measure, a +fundamental concept in stochastic reachability problems, such that a risk-aware +control synthesis problem under LTL specifications is formulated for autonomous +vehicles using occupation measures. As a result, the synthesized policy +achieves balanced decisions across different types of risks with associated +costs, showcasing advantageous versatility and generalizability. The +effectiveness and scalability of the proposed approach are validated by three +typical traffic scenarios in Carla simulator. + +
+
+
+
+
+ + ♻ ☆ From Width-Based Model Checking to Width-Based Automated Theorem Proving + + +
+ In the field of parameterized complexity theory, the study of graph width +measures has been intimately connected with the development of width-based +model checking algorithms for combinatorial properties on graphs. In this work, +we introduce a general framework to convert a large class of width-based +model-checking algorithms into algorithms that can be used to test the validity +of graph-theoretic conjectures on classes of graphs of bounded width. Our +framework is modular and can be applied with respect to several well-studied +width measures for graphs, including treewidth and cliquewidth. + As a quantitative application of our framework, we prove analytically that +for several long-standing graph-theoretic conjectures, there exists an +algorithm that takes a number $k$ as input and correctly determines in time +double-exponential in $k^{O(1)}$ whether the conjecture is valid on all graphs +of treewidth at most $k$. These upper bounds, which may be regarded as +upper-bounds on the size of proofs/disproofs for these conjectures on the class +of graphs of treewidth at most $k$, improve significantly on theoretical upper +bounds obtained using previously available techniques. + +
+
+ comment: A preliminary version of this work was published in the proceedings + of AAAI 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Performance Profiling 2 + +
+
+
+ + ♻ ☆ AI-driven Java Performance Testing: Balancing Result Quality with + Testing Time + + +
+ Performance testing aims at uncovering efficiency issues of software systems. +In order to be both effective and practical, the design of a performance test +must achieve a reasonable trade-off between result quality and testing time. +This becomes particularly challenging in Java context, where the software +undergoes a warm-up phase of execution, due to just-in-time compilation. During +this phase, performance measurements are subject to severe fluctuations, which +may adversely affect quality of performance test results. However, these +approaches often provide suboptimal estimates of the warm-up phase, resulting +in either insufficient or excessive warm-up iterations, which may degrade +result quality or increase testing time. There is still a lack of consensus on +how to properly address this problem. Here, we propose and study an AI-based +framework to dynamically halt warm-up iterations at runtime. Specifically, our +framework leverages recent advances in AI for Time Series Classification (TSC) +to predict the end of the warm-up phase during test execution. We conduct +experiments by training three different TSC models on half a million of +measurement segments obtained from JMH microbenchmark executions. We find that +our framework significantly improves the accuracy of the warm-up estimates +provided by state-of-practice and state-of-the-art methods. This higher +estimation accuracy results in a net improvement in either result quality or +testing time for up to +35.3% of the microbenchmarks. Our study highlights that +integrating AI to dynamically estimate the end of the warm-up phase can enhance +the cost-effectiveness of Java performance testing. + +
+
+ comment: Accepted for publication in The 39th IEEE/ACM International + Conference on Automated Software Engineering (ASE '24) +
+
+
+
+
+ + ♻ ☆ SCAR: Scheduling Multi-Model AI Workloads on Heterogeneous Multi-Chiplet + Module Accelerators MICRO'24 + + +
+ Emerging multi-model workloads with heavy models like recent large language +models significantly increased the compute and memory demands on hardware. To +address such increasing demands, designing a scalable hardware architecture +became a key problem. Among recent solutions, the 2.5D silicon interposer +multi-chip module (MCM)-based AI accelerator has been actively explored as a +promising scalable solution due to their significant benefits in the low +engineering cost and composability. However, previous MCM accelerators are +based on homogeneous architectures with fixed dataflow, which encounter major +challenges from highly heterogeneous multi-model workloads due to their limited +workload adaptivity. Therefore, in this work, we explore the opportunity in the +heterogeneous dataflow MCM AI accelerators. We identify the scheduling of +multi-model workload on heterogeneous dataflow MCM AI accelerator is an +important and challenging problem due to its significance and scale, which +reaches O(10^56) even for a two-model workload on 6x6 chiplets. We develop a +set of heuristics to navigate the huge scheduling space and codify them into a +scheduler, SCAR, with advanced techniques such as inter-chiplet pipelining. Our +evaluation on ten multi-model workload scenarios for datacenter multitenancy +and AR/VR use-cases has shown the efficacy of our approach, achieving on +average 27.6% and 29.6% less energy-delay product (EDP) for the respective +applications settings compared to homogeneous baselines. + +
+
+ comment: MICRO'24 +
+
+
+
+
+
+
+
+ + Hardware Architecture 1 + +
+
+
+ + ♻ ☆ SCAR: Scheduling Multi-Model AI Workloads on Heterogeneous Multi-Chiplet + Module Accelerators MICRO'24 + + +
+ Emerging multi-model workloads with heavy models like recent large language +models significantly increased the compute and memory demands on hardware. To +address such increasing demands, designing a scalable hardware architecture +became a key problem. Among recent solutions, the 2.5D silicon interposer +multi-chip module (MCM)-based AI accelerator has been actively explored as a +promising scalable solution due to their significant benefits in the low +engineering cost and composability. However, previous MCM accelerators are +based on homogeneous architectures with fixed dataflow, which encounter major +challenges from highly heterogeneous multi-model workloads due to their limited +workload adaptivity. Therefore, in this work, we explore the opportunity in the +heterogeneous dataflow MCM AI accelerators. We identify the scheduling of +multi-model workload on heterogeneous dataflow MCM AI accelerator is an +important and challenging problem due to its significance and scale, which +reaches O(10^56) even for a two-model workload on 6x6 chiplets. We develop a +set of heuristics to navigate the huge scheduling space and codify them into a +scheduler, SCAR, with advanced techniques such as inter-chiplet pipelining. Our +evaluation on ten multi-model workload scenarios for datacenter multitenancy +and AR/VR use-cases has shown the efficacy of our approach, achieving on +average 27.6% and 29.6% less energy-delay product (EDP) for the respective +applications settings compared to homogeneous baselines. + +
+
+ comment: MICRO'24 +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 6 + +
+
+
+ + ☆ Weather Prediction Using CNN-LSTM for Time Series Analysis: A Case Study + on Delhi Temperature Data + + +
+ As global climate change intensifies, accurate weather forecasting is +increasingly crucial for sectors such as agriculture, energy management, and +environmental protection. Traditional methods, which rely on physical and +statistical models, often struggle with complex, nonlinear, and time-varying +data, underscoring the need for more advanced techniques. This study explores a +hybrid CNN-LSTM model to enhance temperature forecasting accuracy for the Delhi +region, using historical meteorological data from 1996 to 2017. We employed +both direct and indirect methods, including comprehensive data preprocessing +and exploratory analysis, to construct and train our model. The CNN component +effectively extracts spatial features, while the LSTM captures temporal +dependencies, leading to improved prediction accuracy. Experimental results +indicate that the CNN-LSTM model significantly outperforms traditional +forecasting methods in terms of both accuracy and stability, with a mean square +error (MSE) of 3.26217 and a root mean square error (RMSE) of 1.80615. The +hybrid model demonstrates its potential as a robust tool for temperature +prediction, offering valuable insights for meteorological forecasting and +related fields. Future research should focus on optimizing model architecture, +exploring additional feature extraction techniques, and addressing challenges +such as overfitting and computational complexity. This approach not only +advances temperature forecasting but also provides a foundation for applying +deep learning to other time series forecasting tasks. + +
+
+
+
+
+ + ☆ Developing an Interactive OpenMP Programming Book with Large Language + Models + + +
+ This paper presents an approach to authoring a textbook titled Interactive +OpenMP Programming with the assistance of Large Language Models (LLMs). The +writing process utilized state-of-the-art LLMs, including Gemini Pro 1.5, +Claude 3, and ChatGPT-4, to generate the initial structure and outline of the +book, as well as the initial content for specific chapters. This content +included detailed descriptions of individual OpenMP constructs and practical +programming examples. The outline and content have then undergone extensive +manual revisions to meet our book goals. In this paper, we report our findings +about the capabilities and limitations of these LLMs. We address critical +questions concerning the necessity of textbook resources and the effectiveness +of LLMs in creating fundamental and practical programming content. Our findings +suggest that while LLMs offer significant advantages in generating textbook +content, they require careful integration with traditional educational +methodologies to ensure depth, accuracy, and pedagogical effectiveness. The +Interactive OpenMP Programming book is developed with the framework of Jupyter +Book, enabling the execution of code within the book from the web browser, +providing instant feedback and a dynamic learning experience that stands in +contrast to traditional educational resources. The book represents a +significant step towards modernizing programming education, offering insights +into practical strategies for generating the textbook through advanced AI +tools. + +
+
+
+
+
+ + ☆ Leveraging Foundation Models for Efficient Federated Learning in + Resource-restricted Edge Networks + + +
+ Recently pre-trained Foundation Models (FMs) have been combined with +Federated Learning (FL) to improve training of downstream tasks while +preserving privacy. However, deploying FMs over edge networks with +resource-constrained Internet of Things (IoT) devices is under-explored. This +paper proposes a novel framework, namely, Federated Distilling knowledge to +Prompt (FedD2P), for leveraging the robust representation abilities of a +vision-language FM without deploying it locally on edge devices. This framework +distills the aggregated knowledge of IoT devices to a prompt generator to +efficiently adapt the frozen FM for downstream tasks. To eliminate the +dependency on a public dataset, our framework leverages per-class local +knowledge from IoT devices and linguistic descriptions of classes to train the +prompt generator. Our experiments on diverse image classification datasets +CIFAR, OxfordPets, SVHN, EuroSAT, and DTD show that FedD2P outperforms the +baselines in terms of model performance. + +
+
+
+
+
+ + ☆ A Dynamic Weighting Strategy to Mitigate Worker Node Failure in + Distributed Deep Learning + + +
+ The increasing complexity of deep learning models and the demand for +processing vast amounts of data make the utilization of large-scale distributed +systems for efficient training essential. These systems, however, face +significant challenges such as communication overhead, hardware limitations, +and node failure. This paper investigates various optimization techniques in +distributed deep learning, including Elastic Averaging SGD (EASGD) and the +second-order method AdaHessian. We propose a dynamic weighting strategy to +mitigate the problem of straggler nodes due to failure, enhancing the +performance and efficiency of the overall training process. We conduct +experiments with different numbers of workers and communication periods to +demonstrate improved convergence rates and test performance using our strategy. + +
+
+
+
+
+ + ♻ ☆ SCAR: Scheduling Multi-Model AI Workloads on Heterogeneous Multi-Chiplet + Module Accelerators MICRO'24 + + +
+ Emerging multi-model workloads with heavy models like recent large language +models significantly increased the compute and memory demands on hardware. To +address such increasing demands, designing a scalable hardware architecture +became a key problem. Among recent solutions, the 2.5D silicon interposer +multi-chip module (MCM)-based AI accelerator has been actively explored as a +promising scalable solution due to their significant benefits in the low +engineering cost and composability. However, previous MCM accelerators are +based on homogeneous architectures with fixed dataflow, which encounter major +challenges from highly heterogeneous multi-model workloads due to their limited +workload adaptivity. Therefore, in this work, we explore the opportunity in the +heterogeneous dataflow MCM AI accelerators. We identify the scheduling of +multi-model workload on heterogeneous dataflow MCM AI accelerator is an +important and challenging problem due to its significance and scale, which +reaches O(10^56) even for a two-model workload on 6x6 chiplets. We develop a +set of heuristics to navigate the huge scheduling space and codify them into a +scheduler, SCAR, with advanced techniques such as inter-chiplet pipelining. Our +evaluation on ten multi-model workload scenarios for datacenter multitenancy +and AR/VR use-cases has shown the efficacy of our approach, achieving on +average 27.6% and 29.6% less energy-delay product (EDP) for the respective +applications settings compared to homogeneous baselines. + +
+
+ comment: MICRO'24 +
+
+
+
+
+ + ♻ ☆ Experiments of posture estimation on vehicles using wearable + acceleration sensors + + +
+ In this paper, we study methods to estimate drivers' posture in vehicles +using acceleration data of wearable sensor and conduct a field test. Recently, +sensor technologies have been progressed. Solutions of safety management to +analyze vital data acquired from wearable sensor and judge work status are +proposed. To prevent huge accidents, demands for safety management of bus and +taxi are high. However, acceleration of vehicles is added to wearable sensor in +vehicles, and there is no guarantee to estimate drivers' posture accurately. +Therefore, in this paper, we study methods to estimate driving posture using +acceleration data acquired from T-shirt type wearable sensor hitoe, conduct +field tests and implement a sample application. + Y. Yamato, "Experiments of Posture Estimation on Vehicles Using Wearable +Acceleration Sensors," The 3rd IEEE International Conference on Big Data +Security on Cloud (BigDataSecurity 2017), pp.14-17, DOI: +10.1109/BigDataSecurity.2017.8, May 2017. + "(c) 2017 IEEE. Personal use of this material is permitted. Permission from +IEEE must be obtained for all other uses, in any current or future media, +including reprinting/republishing this material for advertising or promotional +purposes, creating new collective works, for resale or redistribution to +servers or lists, or reuse of any copyrighted component of this work in other +works." + +
+
+ comment: 4 pages, 4 figures, The 3rd IEEE International Conference on Big Data + Security on Cloud (BigDataSecurity 2017), pp.14-17, Beijing, May 2017 +
+
+
+
+
+
+
+
+ + Programming and Languages 1 + +
+
+
+ + ☆ Python Symbolic Execution with LLM-powered Code Generation + + +
+ Symbolic execution is a key technology in software testing, which generates +test cases by collecting symbolic path constraints and then solving constraints +with SMT solvers. Symbolic execution has been proven helpful in generating +high-coverage test cases, but its limitations, e.g., the difficulties in +solving path constraints, prevent it from broader usage in software testing. +Moreover, symbolic execution has encountered many difficulties when applied to +dynamically typed languages like Python, because it is extremely challenging to +translate the flexible Python grammar into rigid solvers. + To overcome the main challenges of applying symbolic execution in Python, we +proposed an LLM-empowered agent, LLM-Sym, that automatically calls an SMT +solver, Z3, to solve execution path constraints. Based on an introductory-level +symbolic execution engine, our LLM agent can extend it to supporting programs +with complex data type `list'. The core contribution of LLM-Sym is translating +complex Python path constraints into Z3 code. To enable accurate path-to-Z3 +translation, we design a multiple-step code generation pipeline including type +inference, retrieval and self-refine. Our experiments demonstrate that LLM-Sym +is capable of solving path constraints on Leetcode problems with complicated +control flows and list data structures, which is impossible for the backbone +symbolic execution engine. Our approach paves the way for the combination of +the generation ability of LLMs with the reasoning ability of symbolic solvers, +and opens up new opportunities in LLM-augmented test case generation. + +
+
+
+
+
+
+
+
+ + Computational Complexity 3 + +
+
+
+ + ♻ ☆ Orbit-blocking words and the average-case complexity of Whitehead's + problem in the free group of rank 2 + + +
+ Let F_2 denote the free group of rank 2. Our main technical result of +independent interest is: for any element u of F_2, there is g in F_2 such that +no cyclically reduced image of u under an automorphism of F_2 contains g as a +subword. We then address computational complexity of the following version of +the Whitehead automorphism problem: given a fixed u in F_2, decide, on an input +v in F_2 of length n, whether or not v is an automorphic image of u. We show +that there is an algorithm that solves this problem and has constant (i.e., +independent of n) average-case complexity. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ Hive-type polytopes for quiver multiplicities and the membership problem + for quiver moment cones + + +
+ Let $Q$ be a bipartite quiver with vertex set $Q_0$ such that the number of +arrows between any source vertex and any sink vertex is constant. Let +$\beta=(\beta(x))_{x \in Q_0}$ be a dimension vector of $Q$ with positive +integer coordinates. + Let $rep(Q, \beta)$ be the representation space of $\beta$-dimensional +representations of $Q$ and $GL(\beta)$ the base change group acting on $rep(Q, +\beta)$ be simultaneous conjugation. Let $K^{\beta}_{\underline{\lambda}}$ be +the multiplicity of the irreducible representation of $GL(\beta)$ of highest +weight $\underline{\lambda}$ in the ring of polynomial functions on $rep(Q, +\beta)$. + We show that $K^{\beta}_{\underline{\lambda}}$ can be expressed as the number +of lattice points of a polytope obtained by gluing together two Knutson-Tao +hive polytopes. Furthermore, this polytopal description together with +Derksen-Weyman's Saturation Theorem for quiver semi-invariants allows us to use +Tardos' algorithm to solve the membership problem for the moment cone +associated to $(Q,\beta)$ in strongly polynomial time. + +
+
+ comment: v2: Fixed the claim about the generic quiver semi-stability problem + (see Remarks 2.8 and 5.5); v3: Final version to appear in Algebraic + Combinatorics. The focus is on polytopal descriptions of multiplicities of + irreducible representations of $GL(\beta)$ in the ring of polynomial + functions on $rep(Q, \beta)$ +
+
+
+
+
+ + ♻ ☆ Orbit-blocking words and the average-case complexity of Whitehead's + problem in the free group of rank 2 + + +
+ Let F_2 denote the free group of rank 2. Our main technical result of +independent interest is: for any element u of F_2, there is g in F_2 such that +no cyclically reduced image of u under an automorphism of F_2 contains g as a +subword. We then address computational complexity of the following version of +the Whitehead automorphism problem: given a fixed u in F_2, decide, on an input +v in F_2 of length n, whether or not v is an automorphic image of u. We show +that there is an algorithm that solves this problem and has constant (i.e., +independent of n) average-case complexity. + +
+
+ comment: 6 pages. arXiv admin note: text overlap with arXiv:2401.09218 +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 1 + +
+
+
+ + ♻ ☆ Normal Forms for Elements of ${}^*$-Continuous Kleene Algebras + Representing the Context-Free Languages + + +
+ Within the tensor product $K \mathop{\otimes_{\cal R}} C_2'$ of any +${}^*$-continuous Kleene algebra $K$ with the polycyclic ${}^*$-continuous +Kleene algebra $C_2'$ over two bracket pairs there is a copy of the fixed-point +closure of $K$: the centralizer of $C_2'$ in $K \mathop{\otimes_{\cal R}} +C_2'$. Using an automata-theoretic representation of elements of +$K\mathop{\otimes_{\cal R}} C_2'$ \`a la Kleene, with the aid of normal form +theorems that restrict the occurrences of brackets on paths through the +automata, we develop a foundation for a calculus of context-free expressions +without variable binders. We also give some results on the bra-ket +${}^*$-continuous Kleene algebra $C_2$, motivate the ``completeness equation'' +that distinguishes $C_2$ from $C_2'$, and show that $C_2'$ already validates a +relativized form of this equation. + +
+
+ comment: Revised version. 43 pages, 4 figures +
+
+
+
+
+
+
+
+ + Logic in Computer Science 1 + +
+
+
+ + ☆ Enumerating Minimal Unsatisfiable Cores of LTLf formulas + + +
+ Linear Temporal Logic over finite traces ($\text{LTL}_f$) is a widely used +formalism with applications in AI, process mining, model checking, and more. +The primary reasoning task for $\text{LTL}_f$ is satisfiability checking; yet, +the recent focus on explainable AI has increased interest in analyzing +inconsistent formulas, making the enumeration of minimal explanations for +infeasibility a relevant task also for $\text{LTL}_f$. This paper introduces a +novel technique for enumerating minimal unsatisfiable cores (MUCs) of an +$\text{LTL}_f$ specification. The main idea is to encode a $\text{LTL}_f$ +formula into an Answer Set Programming (ASP) specification, such that the +minimal unsatisfiable subsets (MUSes) of the ASP program directly correspond to +the MUCs of the original $\text{LTL}_f$ specification. Leveraging recent +advancements in ASP solving yields a MUC enumerator achieving good performance +in experiments conducted on established benchmarks from the literature. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Hardware Architecture 6 + +
+
+
+ + ☆ Generic and ML Workloads in an HPC Datacenter: Node Energy, Job + Failures, and Node-Job Analysis + + +
+ HPC datacenters offer a backbone to the modern digital society. Increasingly, +they run Machine Learning (ML) jobs next to generic, compute-intensive +workloads, supporting science, business, and other decision-making processes. +However, understanding how ML jobs impact the operation of HPC datacenters, +relative to generic jobs, remains desirable but understudied. In this work, we +leverage long-term operational data, collected from a national-scale production +HPC datacenter, and statistically compare how ML and generic jobs can impact +the performance, failures, resource utilization, and energy consumption of HPC +datacenters. Our study provides key insights, e.g., ML-related power usage +causes GPU nodes to run into temperature limitations, median/mean runtime and +failure rates are higher for ML jobs than for generic jobs, both ML and generic +jobs exhibit highly variable arrival processes and resource demands, +significant amounts of energy are spent on unsuccessfully terminating jobs, and +concurrent jobs tend to terminate in the same state. We open-source our +cleaned-up data traces on Zenodo (https://doi.org/10.5281/zenodo.13685426), and +provide our analysis toolkit as software hosted on GitHub +(https://github.com/atlarge-research/2024-icpads-hpc-workload-characterization). +This study offers multiple benefits for data center administrators, who can +improve operational efficiency, and for researchers, who can further improve +system designs, scheduling techniques, etc. + +
+
+ comment: 10 pages, 10 figures, 6 tables, ICPADS 2024 +
+
+
+
+
+ + ☆ Automatic Generation of Fast and Accurate Performance Models for Deep + Neural Network Accelerators + + +
+ Implementing Deep Neural Networks (DNNs) on resource-constrained edge devices +is a challenging task that requires tailored hardware accelerator architectures +and a clear understanding of their performance characteristics when executing +the intended AI workload. To facilitate this, we present an automated +generation approach for fast performance models to accurately estimate the +latency of a DNN mapped onto systematically modeled and concisely described +accelerator architectures. Using our accelerator architecture description +method, we modeled representative DNN accelerators such as Gemmini, UltraTrail, +Plasticine-derived, and a parameterizable systolic array. Together with DNN +mappings for those modeled architectures, we perform a combined DNN/hardware +dependency graph analysis, which enables us, in the best case, to evaluate only +154 loop kernel iterations to estimate the performance for 4.19 billion +instructions achieving a significant speedup. We outperform regression and +analytical models in terms of mean absolute percentage error (MAPE) compared to +simulation results, while being several magnitudes faster than an RTL +simulation. + +
+
+ comment: Accepted version for: ACM Transactions on Embedded Computing Systems +
+
+
+
+
+ + ☆ AnalogGym: An Open and Practical Testing Suite for Analog Circuit + Synthesis + + +
+ Recent advances in machine learning (ML) for automating analog circuit +synthesis have been significant, yet challenges remain. A critical gap is the +lack of a standardized evaluation framework, compounded by various process +design kits (PDKs), simulation tools, and a limited variety of circuit +topologies. These factors hinder direct comparisons and the validation of +algorithms. To address these shortcomings, we introduced AnalogGym, an +open-source testing suite designed to provide fair and comprehensive +evaluations. AnalogGym includes 30 circuit topologies in five categories: +sensing front ends, voltage references, low dropout regulators, amplifiers, and +phase-locked loops. It supports several technology nodes for academic and +commercial applications and is compatible with commercial simulators such as +Cadence Spectre, Synopsys HSPICE, and the open-source simulator Ngspice. +AnalogGym standardizes the assessment of ML algorithms in analog circuit +synthesis and promotes reproducibility with its open datasets and detailed +benchmark specifications. AnalogGym's user-friendly design allows researchers +to easily adapt it for robust, transparent comparisons of state-of-the-art +methods, while also exposing them to real-world industrial design challenges, +enhancing the practical relevance of their work. Additionally, we have +conducted a comprehensive comparison study of various analog sizing methods on +AnalogGym, highlighting the capabilities and advantages of different +approaches. AnalogGym is available in the GitHub repository +https://github.com/CODA-Team/AnalogGym. The documentation is also available at +http://coda-team.github.io/AnalogGym/. + +
+
+
+
+
+ + ☆ Distributed Binary Optimization with In-Memory Computing: An Application + for the SAT Problem + + +
+ In-memory computing (IMC) has been shown to be a promising approach for +solving binary optimization problems while significantly reducing energy and +latency. Building on the advantages of parallel computation, we propose an +IMC-compatible parallelism framework inspired by parallel tempering (PT), +enabling cross-replica communication to improve the performance of IMC solvers. +This framework enables an IMC solver not only to improve performance beyond +what can be achieved through parallelization, but also affords greater +flexibility for the search process with low hardware overhead. We justify that +the framework can be applied to almost any IMC solver. We demonstrate the +effectiveness of the framework for the Boolean satisfiability (SAT) problem, +using the WalkSAT heuristic as a proxy for existing IMC solvers. The resulting +PT-inspired cooperative WalkSAT (PTIC-WalkSAT) algorithm outperforms the +traditional WalkSAT heuristic in terms of the iterations-to-solution in 76.3% +of the tested problem instances and its na\"ive parallel variant (PA-WalkSAT) +does so in 68.4% of the instances. An estimate of the energy overhead of the +PTIC framework for two hardware accelerator architectures indicates that in +both cases the overhead of running the PTIC framework would be less than 1% of +the total energy required to run each accelerator. + +
+
+ comment: 21 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Simultaneous Multithreaded Architecture + + +
+ This paper presents the Dynamic Simultaneous Multi-threaded Architecture +(DSMT). DSMT efficiently executes multiple threads from a single program on a +SMT processor core. To accomplish this, threads are generated dynamically from +a predictable flow of control and then executed speculatively. Data obtained +during the single context non-speculative execution phase of DSMT is used as a +hint to speculate the posterior behavior of multiple threads. DSMT employs +simple mechanisms based on state bits that keep track of inter-thread +dependencies in registers and memory, synchronize thread execution, and control +recovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for +choosing those sections of code which provide the highest performance based on +their past execution history. The DSMT architecture was simulated with a new +cycle-accurate, execution-driven simulator. Our simulation results show that +DSMT has very good potential to improve SMT performance, even when only a +single program is available. However, we found that dynamic thread behavior +together with frequent misspeculation may also produce diminishing returns in +performance. Therefore, the challenge is to maximize the amount of +thread-level parallelism that DSMT is capable of exploiting and at the same +time reduce the frequency of misspeculations. + +
+
+
+
+
+ + ♻ ☆ Hardware-Assisted Virtualization of Neural Processing Units for Cloud + Platforms MICRO'24 + + +
+ Cloud platforms today have been deploying hardware accelerators like neural +processing units (NPUs) for powering machine learning (ML) inference services. +To maximize the resource utilization while ensuring reasonable quality of +service, a natural approach is to virtualize NPUs for efficient resource +sharing for multi-tenant ML services. However, virtualizing NPUs for modern +cloud platforms is not easy. This is not only due to the lack of system +abstraction support for NPU hardware, but also due to the lack of architectural +and ISA support for enabling fine-grained dynamic operator scheduling for +virtualized NPUs. + We present Neu10, a holistic NPU virtualization framework. We investigate +virtualization techniques for NPUs across the entire software and hardware +stack. Neu10 consists of (1) a flexible NPU abstraction called vNPU, which +enables fine-grained virtualization of the heterogeneous compute units in a +physical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go +computing model and flexible vNPU-to-pNPU mappings for improved resource +utilization and cost-effectiveness; (3) an ISA extension of modern NPU +architecture for facilitating fine-grained tensor operator scheduling for +multiple vNPUs. We implement Neu10 based on a production-level NPU simulator. +Our experiments show that Neu10 improves the throughput of ML inference +services by up to 1.4$\times$ and reduces the tail latency by up to +4.6$\times$, while improving the NPU utilization by 1.2$\times$ on average, +compared to state-of-the-art NPU sharing approaches. + +
+
+ comment: Accepted to MICRO'24 +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 10 + +
+
+
+ + ☆ Generic and ML Workloads in an HPC Datacenter: Node Energy, Job + Failures, and Node-Job Analysis + + +
+ HPC datacenters offer a backbone to the modern digital society. Increasingly, +they run Machine Learning (ML) jobs next to generic, compute-intensive +workloads, supporting science, business, and other decision-making processes. +However, understanding how ML jobs impact the operation of HPC datacenters, +relative to generic jobs, remains desirable but understudied. In this work, we +leverage long-term operational data, collected from a national-scale production +HPC datacenter, and statistically compare how ML and generic jobs can impact +the performance, failures, resource utilization, and energy consumption of HPC +datacenters. Our study provides key insights, e.g., ML-related power usage +causes GPU nodes to run into temperature limitations, median/mean runtime and +failure rates are higher for ML jobs than for generic jobs, both ML and generic +jobs exhibit highly variable arrival processes and resource demands, +significant amounts of energy are spent on unsuccessfully terminating jobs, and +concurrent jobs tend to terminate in the same state. We open-source our +cleaned-up data traces on Zenodo (https://doi.org/10.5281/zenodo.13685426), and +provide our analysis toolkit as software hosted on GitHub +(https://github.com/atlarge-research/2024-icpads-hpc-workload-characterization). +This study offers multiple benefits for data center administrators, who can +improve operational efficiency, and for researchers, who can further improve +system designs, scheduling techniques, etc. + +
+
+ comment: 10 pages, 10 figures, 6 tables, ICPADS 2024 +
+
+
+
+
+ + ☆ Exploring System-Heterogeneous Federated Learning with Dynamic Model + Selection + + +
+ Federated learning is a distributed learning paradigm in which multiple +mobile clients train a global model while keeping data local. These mobile +clients can have various available memory and network bandwidth. However, to +achieve the best global model performance, how we can utilize available memory +and network bandwidth to the maximum remains an open challenge. In this paper, +we propose to assign each client a subset of the global model, having different +layers and channels on each layer. To realize that, we design a constrained +model search process with early stop to improve efficiency of finding the +models from such a very large space; and a data-free knowledge distillation +mechanism to improve the global model performance when aggregating models of +such different structures. For fair and reproducible comparison between +different solutions, we develop a new system, which can directly allocate +different memory and bandwidth to each client according to memory and bandwidth +logs collected on mobile devices. The evaluation shows that our solution can +have accuracy increase ranging from 2.43\% to 15.81\% and provide 5\% to 40\% +more memory and bandwidth utilization with negligible extra running time, +comparing to existing state-of-the-art system-heterogeneous federated learning +methods under different available memory and bandwidth, non-i.i.d.~datasets, +image and text tasks. + +
+
+
+
+
+ + ☆ Accurate Computation of the Logarithm of Modified Bessel Functions on + GPUs + + +
+ Bessel functions are critical in scientific computing for applications such +as machine learning, protein structure modeling, and robotics. However, +currently, available routines lack precision or fail for certain input ranges, +such as when the order $v$ is large, and GPU-specific implementations are +limited. We address the precision limitations of current numerical +implementations while dramatically improving the runtime. We propose two novel +algorithms for computing the logarithm of modified Bessel functions of the +first and second kinds by computing intermediate values on a logarithmic scale. +Our algorithms are robust and never have issues with underflows or overflows +while having relative errors on the order of machine precision, even for inputs +where existing libraries fail. In C++/CUDA, our algorithms have median and +maximum speedups of 45x and 6150x for GPU and 17x and 3403x for CPU, +respectively, over the ranges of inputs and third-party libraries tested. +Compared to SciPy, the algorithms have median and maximum speedups of 77x and +300x for GPU and 35x and 98x for CPU, respectively, over the tested inputs. + The ability to robustly compute a solution and the low relative errors allow +us to fit von Mises-Fisher, vMF, distributions to high-dimensional neural +network features. This is, e.g., relevant for uncertainty quantification in +metric learning. We obtain image feature data by processing CIFAR10 training +images with the convolutional layers of a pre-trained ResNet50. We successfully +fit vMF distributions to 2048-, 8192-, and 32768-dimensional image feature data +using our algorithms. Our approach provides fast and accurate results while +existing implementations in SciPy and mpmath fail to fit successfully. + Our approach is readily implementable on GPUs, and we provide a fast +open-source implementation alongside this paper. + +
+
+ comment: Accepted at ICS 2024 +
+
+
+
+
+ + ☆ Byzantine-Robust and Communication-Efficient Distributed Learning via + Compressed Momentum Filtering + + +
+ Distributed learning has become the standard approach for training +large-scale machine learning models across private data silos. While +distributed learning enhances privacy preservation and training efficiency, it +faces critical challenges related to Byzantine robustness and communication +reduction. Existing Byzantine-robust and communication-efficient methods rely +on full gradient information either at every iteration or at certain iterations +with a probability, and they only converge to an unnecessarily large +neighborhood around the solution. Motivated by these issues, we propose a novel +Byzantine-robust and communication-efficient stochastic distributed learning +method that imposes no requirements on batch size and converges to a smaller +neighborhood around the optimal solution than all existing methods, aligning +with the theoretical lower bound. Our key innovation is leveraging Polyak +Momentum to mitigate the noise caused by both biased compressors and stochastic +gradients, thus defending against Byzantine workers under information +compression. We provide proof of tight complexity bounds for our algorithm in +the context of non-convex smooth loss functions, demonstrating that these +bounds match the lower bounds in Byzantine-free scenarios. Finally, we validate +the practical significance of our algorithm through an extensive series of +experiments, benchmarking its performance on both binary classification and +image classification tasks. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+
+ ☆ CompressedMediQ: Hybrid Quantum Machine Learning Pipeline for
+ High-Dimensional Neuroimaging Data
+
+
+
+ This paper introduces CompressedMediQ, a novel hybrid quantum-classical +machine learning pipeline specifically developed to address the computational +challenges associated with high-dimensional multi-class neuroimaging data +analysis. Standard neuroimaging datasets, such as 4D MRI data from the +Alzheimer's Disease Neuroimaging Initiative (ADNI) and Neuroimaging in +Frontotemporal Dementia (NIFD), present significant hurdles due to their vast +size and complexity. CompressedMediQ integrates classical high-performance +computing (HPC) nodes for advanced MRI pre-processing and Convolutional Neural +Network (CNN)-PCA-based feature extraction and reduction, addressing the +limited-qubit availability for quantum data encoding in the NISQ (Noisy +Intermediate-Scale Quantum) era. This is followed by Quantum Support Vector +Machine (QSVM) classification. By utilizing quantum kernel methods, the +pipeline optimizes feature mapping and classification, enhancing data +separability and outperforming traditional neuroimaging analysis techniques. +Experimental results highlight the pipeline's superior accuracy in dementia +staging, validating the practical use of quantum machine learning in clinical +diagnostics. Despite the limitations of NISQ devices, this proof-of-concept +demonstrates the transformative potential of quantum-enhanced learning, paving +the way for scalable and precise diagnostic tools in healthcare and signal +processing. + +
+
+
+
+
+ + ☆ WarmSwap: Sharing Dependencies for Accelerating Cold Starts in + Serverless Functions + + +
+ This work presents WarmSwap, a novel provider-side cold-start optimization +for serverless computing. This optimization reduces cold-start time when +booting and loading dependencies at runtime inside a function container. +Previous approaches to the optimization of cold starts tend to fall into two +categories: optimizing the infrastructure of serverless computing to benefit +all serverless functions; or function-specific tuning for individual serverless +functions. In contrast, WarmSwap offers a broad middle ground, which optimizes +entire categories of serverless functions. WarmSwap eliminates the need to +initialize middleware or software dependencies when launching a new serverless +container, by migrating a pre-initialized live dependency image to the new +function instance. WarmSwap respects the provider's cache constraints, as a +single pre-warmed dependency image in the cache is shared among all serverless +functions requiring that software dependency image. WarmSwap has been tested on +seven representative functions from FunctionBench. The functions are chosen to +compare with previous work. In those tests, WarmSwap accelerates cold-start +executions for those serverless functions with large dependency requirements by +a factor ranging from 1.2 to 2.2. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Simultaneous Multithreaded Architecture + + +
+ This paper presents the Dynamic Simultaneous Multi-threaded Architecture
+(DSMT). DSMT efficiently executes multiple threads from a single program on a
+SMT processor core. To accomplish this, threads are generated dynamically from
+a predictable flow of control and then executed speculatively. Data obtained
+during the single context non-speculative execution phase of DSMT is used as a
+hint to speculate the posterior behavior of multiple threads. DSMT employs
+simple mechanisms based on state bits that keep track of inter-thread
+dependencies in registers and memory, synchronize thread execution, and control
+recovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for
+choosing those sections of code which provide the highest performance based on
+their past execution history. The DSMT architecture was simulated with a new
+cycle-accurate, execution-driven simulator. Our simulation results show that
+DSMT has very good potential to improve SMT performance, even when only a
+single program is available. However, we found that dynamic thread behavior
+together with frequent misspeculation may also produce diminishing returns in
+performance. Therefore, the challenge is to maximize the amount of
+thread-level parallelism that DSMT is capable of exploiting and at the same
+time reduce the frequency of misspeculations.
+
+
+
+
+
+ + ♻ ☆ Confidential Computing on nVIDIA H100 GPU: A Performance Benchmark Study + + +
+ This report evaluates the performance impact of enabling Trusted Execution +Environments (TEE) on nVIDIA H100 GPUs for large language model (LLM) inference +tasks. We benchmark the overhead introduced by TEE mode across various LLMs and +token lengths, with a particular focus on the bottleneck caused by CPU-GPU data +transfers via PCIe. Our results indicate that while there is minimal +computational overhead within the GPU, the overall performance penalty is +primarily attributable to data transfer. For the majority of typical LLM +queries, the overhead remains below 5%, with larger models and longer sequences +experiencing nearly zero overhead. + +
+
+
+
+
+ + ♻ ☆ BE-RAN: Blockchain-enabled Open RAN for 6G with DID and + Privacy-Preserving Communication + + +
+ As 6G networks evolve towards a synergistic system of Communication, Sensing, +and Computing, Radio Access Networks become more distributed, necessitating +robust end-to-end authentication. We propose Blockchain-enabled Radio Access +Networks, a novel decentralized RAN architecture enhancing security, privacy, +and efficiency in authentication processes. BE-RAN leverages distributed ledger +technology to establish trust, offering user-centric identity management, +enabling mutual authentication, and facilitating on-demand point-to-point +inter-network elements and UE-UE communication with accountable logging and +billing service add-on for public network users, all without relying on +centralized authorities. We envision a thoroughly decentralized RAN model and +propose a privacy-preserving P2P communication approach that complements +existing security measures while supporting the CSC paradigm. Results +demonstrate BE-RAN significantly reduces communication and computation +overheads, enhances privacy through decentralized identity management, and +facilitates CSC integration, advancing towards more efficient and secure 6G +networks. + +
+
+
+
+
+ + ♻ ☆ Proposal of Automatic Offloading Method in Mixed Offloading Destination + Environment + + +
+ When using heterogeneous hardware, barriers of technical skills such as +OpenMP, CUDA and OpenCL are high. Based on that, I have proposed +environment-adaptive software that enables automatic conversion, configuration. +However, including existing technologies, there has been no research to +properly and automatically offload the mixed offloading destination environment +such as GPU, FPGA and many core CPU. In this paper, as a new element of +environment-adaptive software, I study a method for offloading applications +properly and automatically in the environment where the offloading destination +is mixed with GPU, FPGA and many core CPU. + Y. Yamato, "Proposal of Automatic Offloading Method in Mixed Offloading +Destination Environment," 2020 Eighth International Symposium on Computing and +Networking Workshops (CANDARW 2020), pp.460-464, DOI: +10.1109/CANDARW51189.2020.00094, Nov. 2020. + "(c) 2020 IEEE. Personal use of this material is permitted. Permission from +IEEE must be obtained for all other uses, in any current or future media, +including reprinting/republishing this material for advertising or promotional +purposes, creating new collective works, for resale or redistribution to +servers or lists, or reuse of any copyrighted component of this work in other +works." + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+
+
+
+ + Programming and Languages 5 + +
+
+
+ + ♻ ☆ DOCE: Finding the Sweet Spot for Execution-Based Code Generation + + +
+ Recently, a diverse set of decoding and reranking procedures have been shown +effective for LLM-based code generation. However, a comprehensive framework +that links and experimentally compares these methods is missing. We address +this by proposing Decoding Objectives for Code Execution, a comprehensive +framework that includes candidate generation, $n$-best reranking, minimum Bayes +risk (MBR) decoding, and self-debugging as the core components. We then study +the contributions of these components through execution-based evaluation +metrics. Our findings highlight the importance of execution-based methods and +the difference gap between execution-based and execution-free methods. +Furthermore, we assess the impact of filtering based on trial unit tests, a +simple and effective strategy that has been often overlooked in prior works. We +also propose self-debugging on multiple candidates, obtaining state-of-the-art +performance on reranking for code generation. We expect our framework to +provide a solid guideline for future research on code generation. + +
+
+ comment: 10 pages (32 including appendix), 5 figures, 25 tables. Prompts are + provided in the GitHub repository to avoid potential text overlap with other + papers +
+
+
+
+
+ + ♻ ☆ A Joint Learning Model with Variational Interaction for Multilingual + Program Translation + + +
+ Programs implemented in various programming languages form the foundation of +software applications. To alleviate the burden of program migration and +facilitate the development of software systems, automated program translation +across languages has garnered significant attention. Previous approaches +primarily focus on pairwise translation paradigms, learning translation between +pairs of languages using bilingual parallel data. However, parallel data is +difficult to collect for some language pairs, and the distribution of program +semantics across languages can shift, posing challenges for pairwise program +translation. In this paper, we argue that jointly learning a unified model to +translate code across multiple programming languages is superior to separately +learning from bilingual parallel data. We propose Variational Interaction for +Multilingual Program Translation~(VIM-PT), a disentanglement-based generative +approach that jointly trains a unified model for multilingual program +translation across multiple languages. VIM-PT disentangles code into +language-shared and language-specific features, using variational inference and +interaction information with a novel lower bound, then achieves program +translation through conditional generation. VIM-PT demonstrates four +advantages: 1) captures language-shared information more accurately from +various implementations and improves the quality of multilingual program +translation, 2) mines and leverages the capability of non-parallel data, 3) +addresses the distribution shift of program semantics across languages, 4) and +serves as a unified model, reducing deployment complexity. + +
+
+ comment: Accepted by the 39th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2024) +
+
+
+
+
+ + ♻ ☆ CoverUp: Coverage-Guided LLM-Based Test Generation + + +
+ Testing is an essential part of software development. Test generation tools +attempt to automate the otherwise labor-intensive task of test creation, but +generating high-coverage tests remains a challenge. This paper proposes +CoverUp, a novel approach to driving the generation of high-coverage Python +regression tests. CoverUp iteratively improves test coverage, interleaving +coverage analysis with dialogs with the LLM that steer it to refine tests so +that they increase coverage of lines and branches. We evaluate our prototype +CoverUp implementation across a benchmark of challenging code derived from +open-source Python projects, and show that CoverUp substantially improves on +the state of the art. Compared to CodaMosa, a hybrid search/LLM-based test +generator, CoverUp achieves a per-module median line+branch coverage of 80% +(vs. 47%). Compared to MuTAP, a mutation/LLM-based test generator, CoverUp +achieves an overall line+branch coverage of 90% (vs. 77%). We show that +CoverUp's iterative, coverage-guided approach is crucial to its effectiveness, +contributing to nearly 40% of its successes. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Understanding How CodeLLMs (Mis)Predict Types with Activation Steering + + +
+ CodeLLMs are transforming software development as we know it. This is +especially true for tasks where rule-based approaches fall short, like type +prediction. The type prediction task consists in adding a new type annotation +to a partially typed program, such that the resulting program is closer to +being fully typed. The intractability of rule-based approaches and high cost of +manual annotation make CodeLLMs an attractive solution to the problem. However, +CodeLLMs are still far from being deployed on the large-scale due to doubts +surrounding their reliability. + To shed some light on how CodeLLMs approach type prediction, we investigate +what happens when a model mispredicts a type. We show that by applying +semantics-preserving edits to code, CodeLLMs are eventually misled into +mispredicting type annotations. However, by leveraging activation steering we +are able to "steer" the model back to the correct prediction, making models +more robust against semantically irrelevant prompt features. We show that +steering achieves comparable performance to fine-tuning directly on the type +prediction task. Furthermore, we find that steering vectors computed from +Python code are effective at correcting TypeScript mispredictions, and vice +versa. To our knowledge, this is the first evidence of its kind to suggest that +CodeLLMs learn task representations that transfer across languages. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ On Higher-Order Reachability Games vs May Reachability + + +
+ We consider the reachability problem for higher-order functional programs and +study the relationship between reachability games (i.e., the reachability +problem for programs with angelic and demonic nondeterminism) and +may-reachability (i.e., the reachability problem for programs with only angelic +nondeterminism). We show that reachability games for order-n programs can be +reduced to may-reachability for order-(n+1) programs, and vice versa. We +formalize the reductions by using higher-order fixpoint logic and prove their +correctness. We also discuss applications of the reductions to higher-order +program verification. + +
+
+
+
+
+
+
+
+ + Performance Profiling 2 + +
+
+
+ + ☆ Automatic Generation of Fast and Accurate Performance Models for Deep + Neural Network Accelerators + + +
+ Implementing Deep Neural Networks (DNNs) on resource-constrained edge devices +is a challenging task that requires tailored hardware accelerator architectures +and a clear understanding of their performance characteristics when executing +the intended AI workload. To facilitate this, we present an automated +generation approach for fast performance models to accurately estimate the +latency of a DNN mapped onto systematically modeled and concisely described +accelerator architectures. Using our accelerator architecture description +method, we modeled representative DNN accelerators such as Gemmini, UltraTrail, +Plasticine-derived, and a parameterizable systolic array. Together with DNN +mappings for those modeled architectures, we perform a combined DNN/hardware +dependency graph analysis, which enables us, in the best case, to evaluate only +154 loop kernel iterations to estimate the performance for 4.19 billion +instructions achieving a significant speedup. We outperform regression and +analytical models in terms of mean absolute percentage error (MAPE) compared to +simulation results, while being several magnitudes faster than an RTL +simulation. + +
+
+ comment: Accepted version for: ACM Transactions on Embedded Computing Systems +
+
+
+
+
+ + ♻ ☆ Confidential Computing on nVIDIA H100 GPU: A Performance Benchmark Study + + +
+ This report evaluates the performance impact of enabling Trusted Execution +Environments (TEE) on nVIDIA H100 GPUs for large language model (LLM) inference +tasks. We benchmark the overhead introduced by TEE mode across various LLMs and +token lengths, with a particular focus on the bottleneck caused by CPU-GPU data +transfers via PCIe. Our results indicate that while there is minimal +computational overhead within the GPU, the overall performance penalty is +primarily attributable to data transfer. For the majority of typical LLM +queries, the overhead remains below 5%, with larger models and longer sequences +experiencing nearly zero overhead. + +
+
+
+
+
+
+
+
+
+ Operating Systems 1
+
+
+
+
+ + ♻ ☆ Hardware-Assisted Virtualization of Neural Processing Units for Cloud + Platforms MICRO'24 + + +
+ Cloud platforms today have been deploying hardware accelerators like neural +processing units (NPUs) for powering machine learning (ML) inference services. +To maximize the resource utilization while ensuring reasonable quality of +service, a natural approach is to virtualize NPUs for efficient resource +sharing for multi-tenant ML services. However, virtualizing NPUs for modern +cloud platforms is not easy. This is not only due to the lack of system +abstraction support for NPU hardware, but also due to the lack of architectural +and ISA support for enabling fine-grained dynamic operator scheduling for +virtualized NPUs. + We present Neu10, a holistic NPU virtualization framework. We investigate +virtualization techniques for NPUs across the entire software and hardware +stack. Neu10 consists of (1) a flexible NPU abstraction called vNPU, which +enables fine-grained virtualization of the heterogeneous compute units in a +physical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go +computing model and flexible vNPU-to-pNPU mappings for improved resource +utilization and cost-effectiveness; (3) an ISA extension of modern NPU +architecture for facilitating fine-grained tensor operator scheduling for +multiple vNPUs. We implement Neu10 based on a production-level NPU simulator. +Our experiments show that Neu10 improves the throughput of ML inference +services by up to 1.4$\times$ and reduces the tail latency by up to +4.6$\times$, while improving the NPU utilization by 1.2$\times$ on average, +compared to state-of-the-art NPU sharing approaches. + +
+
+ comment: Accepted to MICRO'24 +
+
+
+
+
+
+
+
+ + Computational Complexity 6 + +
+
+
+ + ☆ Vertex identification to a forest + + +
+ Let $\mathcal{H}$ be a graph class and $k\in\mathbb{N}$. We say a graph $G$ +admits a \emph{$k$-identification to $\mathcal{H}$} if there is a partition +$\mathcal{P}$ of some set $X\subseteq V(G)$ of size at most $k$ such that after +identifying each part in $\mathcal{P}$ to a single vertex, the resulting graph +belongs to $\mathcal{H}$. The graph parameter ${\sf id}_{\mathcal{H}}$ is +defined so that ${\sf id}_{\mathcal{H}}(G)$ is the minimum $k$ such that $G$ +admits a $k$-identification to $\mathcal{H}$, and the problem of +\textsc{Identification to $\mathcal{H}$} asks, given a graph $G$ and +$k\in\mathbb{N}$, whether ${\sf id}_{\mathcal{H}}(G)\le k$. If we set +$\mathcal{H}$ to be the class $\mathcal{F}$ of acyclic graphs, we generate the +problem \textsc{Identification to Forest}, which we show to be {\sf +NP}-complete. We prove that, when parameterized by the size $k$ of the +identification set, it admits a kernel of size $2k+1$. For our kernel we reveal +a close relation of \textsc{Identification to Forest} with the \textsc{Vertex +Cover} problem. We also study the combinatorics of the \textsf{yes}-instances +of \textsc{Identification to $\mathcal{H}$}, i.e., the class +$\mathcal{H}^{(k)}:=\{G\mid {\sf id}_{\mathcal{H}}(G)\le k\}$, {which we show +to be minor-closed for every $k$} when $\mathcal{H}$ is minor-closed. We prove +that the minor-obstructions of $\mathcal{F}^{(k)}$ are of size at most $2k+4$. +We also prove that every graph $G$ such that ${\sf id}_{\mathcal{F}}(G)$ is +sufficiently big contains as a minor either a cycle on $k$ vertices, or $k$ +disjoint triangles, or the \emph{$k$-marguerite} graph, that is the graph +obtained by $k$ disjoint triangles by identifying one vertex of each of them +into the same vertex. + +
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ Rice-like complexity lower bounds for Boolean and uniform automata + networks + + +
+ Automata networks are a versatile model of finite discrete dynamical systems
+composed of interacting entities (the automata), able to embed any directed
+graph as a dynamics on its space of configurations (the set of vertices,
+representing all the assignments of a state to each entity). In this world,
+virtually any question is decidable by a simple exhaustive search. We leverage
+the Rice-like complexity lower bound, stating that any non-trivial monadic
+second order logic question on the graph of its dynamics is NP-hard or
+coNP-hard (given the automata network description), to bounded alphabets
+(including the Boolean case). This restriction is particularly meaningful for
+applications to "complex systems", where each entity has a restricted set of
+possible states (its alphabet). For the non-deterministic case, trivial
+questions are solvable in constant time, hence there is a sharp gap in
+complexity for the algorithmic solving of concrete problems on them. For the
+non-deterministic case, non-triviality is defined at bounded treewidth, which
+offers a structure to establish metatheorems of complexity lower bounds.
+
+
+
+
+
+ + ☆ Journalists, Emotions, and the Introduction of Generative AI Chatbots: A + Large-Scale Analysis of Tweets Before and After the Launch of ChatGPT + + +
+ As part of a broader look at the impact of generative AI, this study +investigated the emotional responses of journalists to the release of ChatGPT +at the time of its launch. By analyzing nearly 1 million Tweets from +journalists at major U.S. news outlets, we tracked changes in emotional tone +and sentiment before and after the introduction of ChatGPT in November 2022. +Using various computational and natural language processing techniques to +measure emotional shifts in response to ChatGPT's release, we found an increase +in positive emotion and a more favorable tone post-launch, suggesting initial +optimism toward AI's potential. This research underscores the pivotal role of +journalists as interpreters of technological innovation and disruption, +highlighting how their emotional reactions may shape public narratives around +emerging technologies. The study contributes to understanding the intersection +of journalism, emotion, and AI, offering insights into the broader societal +impact of generative AI tools. + +
+
+
+
+
+ + ♻ ☆ Improved Hardness Results of the Cardinality-Based Minimum s-t Cut + Problem in Hypergraphs + + +
+ In hypergraphs an edge that crosses a cut can be split in several ways, +depending on how many nodes are placed on each side of the cut. A +cardinality-based splitting function assigns a nonnegative cost of $w_i$ for +each cut hyperedge $e$ with exactly $i$ nodes on the side of the cut that +contains the minority of nodes from $e$. The cardinality-based minimum $s$-$t$ +cut aims to find an $s$-$t$ cut with minimum total cost. Assuming the costs +$w_i$ are polynomially bounded by the input size and $w_0=0$ and $w_1=1$, we +show that the problem becomes NP-hard outside the submodular region found by +Veldt et al. Our result also holds for $k$-uniform hypergraphs with $k \geq 4$. +Specifically for $4$-uniform hypergraphs we show that the problem is NP-hard +for all $w_2>2$, and additionally prove that the \textsc{No-Even-Split} problem +is NP-hard. + +
+
+
+
+
+ + ♻ ☆ Approximately counting maximal independent set is equivalent to #SAT + + +
+ A maximal independent set is an independent set that is not a subset of any +other independent set. It is also the key problem of mathematics, computer +science, and other fields. A counting problem is a type of computational +problem that associated with the number of solutions. Besides, counting +problems help us better understand several fields such as algorithm analysis, +complexity theory, artificial intelligence, etc. The problem of counting +maximal independent sets is #P-complete. So it is natural to think about +approximate counting for maximal independent sets problem. In this article, we +study the complexity of approximately counting maximal independent sets. +Specifically, we are the first to prove that the #MIS problem is +AP-interreducible with the #SAT of a given general graph. + +
+
+ comment: After discussion, this is already known in JCSS (with the + arXiv:1411.6829),proving that approximately counting MIS in bipartite graphs + is equivalent to #SAT under AP-reductions, it is a stronger result if it + restricts to bipartite graphs, which implies it for general graphs. + Therefore, this paper tends to be more of a direct proof exercise +
+
+
+
+
+ + ♻ ☆ Maximum $k$- vs. $\ell$-colourings of graphs + + +
+ We present polynomial-time SDP-based algorithms for the following problem: +For fixed $k \leq \ell$, given a real number $\epsilon>0$ and a graph $G$ that +admits a $k$-colouring with a $\rho$-fraction of the edges coloured properly, +it returns an $\ell$-colouring of $G$ with an $(\alpha \rho - +\epsilon)$-fraction of the edges coloured properly in polynomial time in $G$ +and $1 / \epsilon$. Our algorithms are based on the algorithms of Frieze and +Jerrum [Algorithmica'97] and of Karger, Motwani and Sudan [JACM'98]. + When $k$ is fixed and $\ell$ grows large, our algorithm achieves an +approximation ratio of $\alpha = 1 - o(1 / \ell)$. When $k, \ell$ are both +large, our algorithm achieves an approximation ratio of $\alpha = 1 - 1 / \ell ++ 2 \ln \ell / k \ell - o(\ln \ell / k \ell) - O(1 / k^2)$; if we fix $d = \ell +- k$ and allow $k, \ell$ to grow large, this is $\alpha = 1 - 1 / \ell + 2 \ln +\ell / k \ell - o(\ln \ell / k \ell)$. + By extending the results of Khot, Kindler, Mossel and O'Donnell [SICOMP'07] +to the promise setting, we show that for large $k$ and $\ell$, assuming Khot's +Unique Games Conjecture (\UGC), it is \NP-hard to achieve an approximation +ratio $\alpha$ greater than $1 - 1 / \ell + 2 \ln \ell / k \ell + o(\ln \ell / +k \ell)$, provided that $\ell$ is bounded by a function that is +$o(\exp(\sqrt[3]{k}))$. For the case where $d = \ell - k$ is fixed, this bound +matches the performance of our algorithm up to $o(\ln \ell / k \ell)$. +Furthermore, by extending the results of Guruswami and Sinop [ToC'13] to the +promise setting, we prove that it is \NP-hard to achieve an approximation ratio +greater than $1 - 1 / \ell + 8 \ln \ell / k \ell + o(\ln \ell / k \ell)$, +provided again that $\ell$ is bounded as before (but this time without assuming +the \UGC). + +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 4 + +
+
+
+ + ☆ Run supports and initial algebra supports of weighted automata + + +
+ We consider weighted automata over words and over trees where the weight +algebras are strong bimonoids, i.e., semirings which may lack distributivity. +It is well known that, for each such weighted automaton, its run semantics and +its initial algebra semantics can be different, due to the presence of +nondeterminism and the absence of distributivity. Here we investigate the +question under which conditions on the strong bimonoid the support of the run +semantics equals the support of the initial algebra semantics. We prove a +characterization of this equality in terms of strongly zero-sum-free strong +bimonoids (for weighted automata over words) and in terms of bi-strongly +zero-sum-free strong bimonoids (for weighted automata over trees). We also +consider shortly the images of the two semantics functions. + +
+
+
+
+
+ + ♻ ☆ Submonoid Membership in n-dimensional lamplighter groups and S-unit + equations + + +
+ We show that Submonoid Membership is decidable in n-dimensional lamplighter +groups $(\mathbb{Z}/p\mathbb{Z}) \wr \mathbb{Z}^n$ for any prime $p$ and +integer $n$. More generally, we show decidability of Submonoid Membership in +semidirect products of the form $\mathcal{Y} \rtimes \mathbb{Z}^n$, where +$\mathcal{Y}$ is any finitely presented module over the Laurent polynomial ring +$\mathbb{F}_p[X_1^{\pm}, \ldots, X_n^{\pm}]$. Combined with a result of Shafrir +(2024), this gives the first example of a group $G$ and a finite index subgroup +$\widetilde{G} \leq G$, such that Submonoid Membership is decidable in +$\widetilde{G}$ but undecidable in $G$. + To obtain our decidability result, we reduce Submonoid Membership in +$\mathcal{Y} \rtimes \mathbb{Z}^n$ to solving S-unit equations over +$\mathbb{F}_p[X_1^{\pm}, \ldots, X_n^{\pm}]$-modules. We show that the solution +set of such equations is effectively $p$-automatic, extending a result of +Adamczewski and Bell (2012). As an intermediate result, we also obtain that the +solution set of the Knapsack Problem in $\mathcal{Y} \rtimes \mathbb{Z}^n$ is +effectively $p$-automatic. + +
+
+ comment: corrected a mistake in Lemma 5.9, modified Lemma 5.8, some other + minor changes +
+
+
+
+
+ + ♻ ☆ Subsequences in Bounded Ranges: Matching and Analysis Problems + + +
+ In this paper, we consider a variant of the classical algorithmic problem of +checking whether a given word $v$ is a subsequence of another word $w$. More +precisely, we consider the problem of deciding, given a number $p$ (defining a +range-bound) and two words $v$ and $w$, whether there exists a factor +$w[i:i+p-1]$ (or, in other words, a range of length $p$) of $w$ having $v$ as +subsequence (i.\,e., $v$ occurs as a subsequence in the bounded range +$w[i:i+p-1]$). We give matching upper and lower quadratic bounds for the time +complexity of this problem. Further, we consider a series of algorithmic +problems in this setting, in which, for given integers $k$, $p$ and a word $w$, +we analyse the set $p$-Subseq$_{k}(w)$ of all words of length $k$ which occur +as subsequence of some factor of length $p$ of $w$. Among these, we consider +the $k$-universality problem, the $k$-equivalence problem, as well as problems +related to absent subsequences. Surprisingly, unlike the case of the classical +model of subsequences in words where such problems have efficient solutions in +general, we show that most of these problems become intractable in the new +setting when subsequences in bounded ranges are considered. Finally, we provide +an example of how some of our results can be applied to subsequence matching +problems for circular words. + +
+
+ comment: Extended version of a paper which will appear in the proceedings of + the 16th International Conference on Reachability Problems, RP 2022 +
+
+
+
+
+ + ♻ ☆ Learning Realtime One-Counter Automata + + +
+ We present a new learning algorithm for realtime one-counter automata. Our +algorithm uses membership and equivalence queries as in Angluin's L* algorithm, +as well as counter value queries and partial equivalence queries. In a partial +equivalence query, we ask the teacher whether the language of a given +finite-state automaton coincides with a counter-bounded subset of the target +language. We evaluate an implementation of our algorithm on a number of random +benchmarks and on a use case regarding efficient JSON-stream validation. + +
+
+ comment: 55 pages, 9 figures, submitted to TACAS 2022 +
+
+
+
+
+
+
+
+ + Logic in Computer Science 5 + +
+
+
+ + ☆ Rice-like complexity lower bounds for Boolean and uniform automata + networks + + +
+ Automata networks are a versatile model of finite discrete dynamical systems +composed of interacting entities (the automata), able to embed any directed +graph as a dynamics on its space of configurations (the set of vertices, +representing all the assignments of a state to each entity). In this world, +virtually any question is decidable by a simple exhaustive search. We lever the +Rice-like complexity lower bound, stating that any non-trivial monadic second +order logic question on the graph of its dynamics is NP-hard or coNP-hard +(given the automata network description), to bounded alphabets (including the +Boolean case). This restriction is particularly meaningful for applications to +"complex systems", where each entity has a restricted set of possible states +(its alphabet). For the non-deterministic case, trivial questions are solvable +in constant time, hence there is a sharp gap in complexity for the algorithmic +solving of concrete problems on them. For the non-deterministic case, +non-triviality is defined at bounded treewidth, which offers a structure to +establish metatheorems of complexity lower bounds. + +
+
+
+
+
+ + ☆ Winning Strategy Templates for Stochastic Parity Games towards + Permissive and Resilient Control + + +
+ Stochastic games play an important role for many purposes such as the control +of cyber-physical systems (CPS), where the controller and the environment are +modeled as players. Conventional algorithms typically solve the game for a +single winning strategy in order to develop a controller. However, in +applications such as CPS control, permissive controllers are crucial as they +allow the controlled system to adapt if additional constraints need to be +imposed and also remain resilient to system changes at runtime. In this work, +we generalize the concept of permissive winning strategy templates, introduced +by Anand et al. at TACAS and CAV 2023 for deterministic games, to encompass +stochastic games. These templates represent an infinite number of winning +strategies and can adapt strategies to system changes efficiently. We focus on +five key winning objectives -- safety, reachability, B\"uchi, co-B\"uchi, and +parity -- and present algorithms to construct templates for each objective. In +addition, we propose a novel method to extract a winning strategy from a +template and provide discussions on template comparison. + +
+
+
+
+
+ + ☆ The Challenges of Effective AGM Belief Contraction + + +
+ Despite the significant interest in extending the AGM paradigm of belief +change beyond finitary logics, the computational aspects of AGM have remained +almost untouched. We investigate the computability of AGM contraction on +non-finitary logics, and show an intriguing negative result: there are +infinitely many uncomputable AGM contraction functions in such logics. +Drastically, even if we restrict the theories used to represent epistemic +states, in all non-trivial cases, the uncomputability remains. On the positive +side, we identify an infinite class of computable AGM contraction functions on +Linear Temporal Logic (LTL). We use B\"uchi automata to construct such +functions as well as to represent and reason about LTL knowledge. + +
+
+ comment: 20 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ On Higher-Order Reachability Games vs May Reachability + + +
+ We consider the reachability problem for higher-order functional programs and +study the relationship between reachability games (i.e., the reachability +problem for programs with angelic and demonic nondeterminism) and +may-reachability (i.e., the reachability problem for programs with only angelic +nondeterminism). We show that reachability games for order-n programs can be +reduced to may-reachability for order-(n+1) programs, and vice versa. We +formalize the reductions by using higher-order fixpoint logic and prove their +correctness. We also discuss applications of the reductions to higher-order +program verification. + +
+
+
+
+
+ + ♻ ☆ Parameterized Model-checking of Discrete-Timed Networks and + Symmetric-Broadcast Systems + + +
+ We study the complexity of the model-checking problem for parameterized +discrete-timed systems with arbitrarily many anonymous and identical processes, +with and without a distinguished "controller", and communicating via +synchronous rendezvous. Our framework extends the seminal work from German and +Sistla on untimed systems by adding discrete-time clocks to processes. For the +case without a controller, we show that the systems can be efficiently +simulated -- and vice versa -- by systems of untimed processes that communicate +via rendezvous and symmetric broadcast, which we call "RB-systems". Symmetric +broadcast is a novel communication primitive that allows all processes to +synchronize at once; however, it does not distinguish between sending and +receiving processes. We show that the parameterized model-checking problem for +safety specifications is pspace-complete, and for liveness specifications it is +decidable in exptime. The latter result is proved using automata theory, +rational linear programming, and geometric reasoning for solving certain +reachability questions in a new variant of vector addition systems called +"vector rendezvous systems". We believe these proof techniques are of +independent interest and will be useful in solving related problems. For the +case with a controller, we show that the parameterized model-checking problems +for RB-systems and systems with asymmetric broadcast as a primitive are +inter-reducible. This allows us to prove that for discrete timed-networks with +a controller the parameterized model-checking problem is undecidable for +liveness specifications. Our work exploits the intimate connection between +parameterized discrete-timed systems and systems of processes communicating via +broadcast, providing a rare and surprising decidability result for liveness +properties of parameterized timed-systems, as well as extend work from untimed +systems to timed systems. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Performance Profiling 6 + +
+
+
+ + ☆ Anonymized Network Sensing Graph Challenge + + +
+ The MIT/IEEE/Amazon GraphChallenge encourages community approaches to +developing new solutions for analyzing graphs and sparse data derived from +social media, sensor feeds, and scientific data to discover relationships +between events as they unfold in the field. The anonymized network sensing +Graph Challenge seeks to enable large, open, community-based approaches to +protecting networks. Many large-scale networking problems can only be solved +with community access to very broad data sets with the highest regard for +privacy and strong community buy-in. Such approaches often require +community-based data sharing. In the broader networking community (commercial, +federal, and academia) anonymized source-to-destination traffic matrices with +standard data sharing agreements have emerged as a data product that can meet +many of these requirements. This challenge provides an opportunity to highlight +novel approaches for optimizing the construction and analysis of anonymized +traffic matrices using over 100 billion network packets derived from the +largest Internet telescope in the world (CAIDA). This challenge specifies the +anonymization, construction, and analysis of these traffic matrices. A +GraphBLAS reference implementation is provided, but the use of GraphBLAS is not +required in this Graph Challenge. As with prior Graph Challenges the goal is to +provide a well-defined context for demonstrating innovation. Graph Challenge +participants are free to select (with accompanying explanation) the Graph +Challenge elements that are appropriate for highlighting their innovations. + +
+
+ comment: Accepted to IEEE HPEC 2024 +
+
+
+
+
+ + ☆ Microarchitectural comparison and in-core modeling of state-of-the-art + CPUs: Grace, Sapphire Rapids, and Genoa + + +
+ With Nvidia's release of the Grace Superchip, all three big semiconductor +companies in HPC (AMD, Intel, Nvidia) are currently competing in the race for +the best CPU. In this work we analyze the performance of these state-of-the-art +CPUs and create an accurate in-core performance model for their +microarchitectures Zen 4, Golden Cove, and Neoverse V2, extending the Open +Source Architecture Code Analyzer (OSACA) tool and comparing it with LLVM-MCA. +Starting from the peculiarities and up- and downsides of a single core, we +extend our comparison by a variety of microbenchmarks and the capabilities of a +full node. The "write-allocate (WA) evasion" feature, which can automatically +reduce the memory traffic caused by write misses, receives special attention; +we show that the Grace Superchip has a next-to-optimal implementation of WA +evasion, and that the only way to avoid write allocates on Zen 4 is the +explicit use of non-temporal stores. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Computational Algorithms for the Product Form Solution of Closed Queuing + Networks with Finite Buffers and Skip-Over Policy + + +
+ Closed queuing networks with finite capacity buffers and skip-over policies +are fundamental models in the performance evaluation of computer and +communication systems. This technical report presents the details of +computational algorithms to derive the key performance metrics for such +networks. The primary focus is on the efficient computation of the +normalization constant, which is critical for determining the steady-state +probabilities of the network states under investigation. A convolution +algorithm is proposed, which paves the way for the computation of key +performance indices, such as queue length distribution and throughput, +accommodating the intricacies introduced by finite capacity constraints and +skip-over mechanisms. Finally, an extension of the traditional Mean Value +Analysis algorithm addressing numerical stability is provided. The approaches +discussed here make the investigation of large-scale networks feasible +and enable the development of robust implementations of these techniques for +practical use. + +
+
+
+
+
+ + ☆ Repr Types: One Abstraction to Rule Them All + + +
+ The choice of how to represent an abstract type can have a major impact on +the performance of a program, yet mainstream compilers cannot perform +optimizations at such a high level. When dealing with optimizations of data +type representations, an important feature is having extensible +representation-flexible data types; the ability for a programmer to add new +abstract types and operations, as well as concrete implementations of these, +without modifying the compiler or a previously defined library. Many research +projects support high-level optimizations through static analysis, +instrumentation, or benchmarking, but they are all restricted in at least one +aspect of extensibility. + This paper presents a new approach to representation-flexible data types +without such restrictions and which still finds efficient optimizations. Our +approach centers around a single built-in type $\texttt{repr}$ and function +overloading with cost annotations for operation implementations. We evaluate +our approach (i) by defining a universal collection type as a library, a single +type for all conventional collections, and (ii) by designing and implementing a +representation-flexible graph library. Programs using $\texttt{repr}$ types are +typically faster than programs with idiomatic representation choices -- +sometimes dramatically so -- as long as the compiler finds good implementations +for all operations. Our compiler performs the analysis efficiently by finding +optimized solutions quickly and by reusing previous results to avoid +recomputations. + +
+
+ comment: 25 pages, 11 figures +
+
+
+
+
+ + ☆ E-QUARTIC: Energy Efficient Edge Ensemble of Convolutional Neural + Networks for Resource-Optimized Learning SP + + +
+ Ensemble learning is a meta-learning approach that combines the predictions +of multiple learners, demonstrating improved accuracy and robustness. +Nevertheless, ensembling models like Convolutional Neural Networks (CNNs) +result in high memory and computing overhead, preventing their deployment in +embedded systems. These devices are usually equipped with small batteries that +provide power supply and might include energy-harvesting modules that extract +energy from the environment. In this work, we propose E-QUARTIC, a novel Energy +Efficient Edge Ensembling framework to build ensembles of CNNs targeting +Artificial Intelligence (AI)-based embedded systems. Our design outperforms +single-instance CNN baselines and state-of-the-art edge AI solutions, improving +accuracy and adapting to varying energy conditions while maintaining similar +memory requirements. Then, we leverage the multi-CNN structure of the designed +ensemble to implement an energy-aware model selection policy in +energy-harvesting AI systems. We show that our solution outperforms the +state-of-the-art by reducing system failure rate by up to 40% while ensuring +higher average output qualities. Ultimately, we show that the proposed design +enables concurrent on-device training and high-quality inference execution at +the edge, limiting the performance and energy overheads to less than 0.04%. + +
+
+ comment: Accepted by the 30th Asia and South Pacific Design Automation + Conference (ASP-DAC 2025) +
+
+
+
+
+ + ♻ ☆ RIS-Assisted Received Adaptive Spatial Modulation for Wireless + Communication + + +
+ A novel wireless transmission scheme, as named the reconfigurable intelligent +surface (RIS)-assisted received adaptive spatial modulation (RASM) scheme, is +proposed in this paper. In this scheme, the adaptive spatial modulation +(ASM)-based antennas selection works at the receiver by employing the +characteristics of the RIS in each time slot, where the signal-to-noise ratio +at specific selected antennas can be further enhanced with near few powers. +Besides for the bits from constellation symbols, the extra bits can be mapped +into the indices of receive antenna combinations and conveyed to the receiver +through the ASM-based antenna-combination selection, thus providing higher +spectral efficiency. To explicitly present the RASM scheme, the analytical +performance of bit error rate of it is discussed in this paper. As a trade-off +selection, the proposed scheme shows higher spectral efficiency and remains the +satisfactory error performance. Simulation and analytical results demonstrate +the better performance and exhibit more potential to apply in practical +wireless communication. + +
+
+
+
+
+
+
+
+ + Hardware Architecture 6 + +
+
+
+ + ☆ Photonic Quantum Computers + + +
+ In the pursuit of scalable and fault-tolerant quantum computing +architectures, photonic-based quantum computers have emerged as a leading +frontier. This article provides a comprehensive overview of advancements in +photonic quantum computing, developed by leading industry players, examining +current performance, architectural designs, and strategies for developing +large-scale, fault-tolerant photonic quantum computers. It also highlights +recent groundbreaking experiments that leverage the unique advantages of +photonic technologies, underscoring their transformative potential. This review +captures a pivotal moment of photonic quantum computing in the noisy +intermediate-scale quantum (NISQ) era, offering insights into how photonic +quantum computers might reshape the future of quantum computing. + +
+
+ comment: 47 pages, 16 figures +
+
+
+
+
+ + ☆ Rethinking Programmed I/O for Fast Devices, Cheap Cores, and Coherent + Interconnects + + +
+ Conventional wisdom holds that an efficient interface between an OS running +on a CPU and a high-bandwidth I/O device should be based on Direct Memory +Access (DMA), descriptor rings, and interrupts: DMA offloads transfers from the +CPU, descriptor rings provide buffering and queuing, and interrupts facilitate +asynchronous interaction between cores and device with a lightweight +notification mechanism. In this paper we question this wisdom in the light of +modern hardware and workloads, particularly in cloud servers. We argue that the +assumptions that led to this model are obsolete, and in many use-cases use of +programmed I/O, where the CPU explicitly transfers data and control information +to and from a device via loads and stores, actually results in a more efficient +system. We quantitatively demonstrate these advantages using three use-cases: +fine-grained RPC-style invocation of functions on an accelerator, offloading of +operators in a streaming dataflow engine, and a network interface targeting for +serverless functions. Moreover, we show that while these advantages are +significant over a modern PCIe peripheral bus, a truly cache-coherent +interconnect offers significant additional efficiency gains. + +
+
+
+
+
+ + ☆ Dynamic Simultaneous Multithreaded Architecture + + +
+ This paper presents the Dynamic Simultaneous Multi-threaded Architecture +(DSMT). DSMT efficiently executes multiple threads from a single program on a +SMT processor core. To accomplish this, threads are generated dynamically from +a predictable flow of control and then executed speculatively. Data obtained +during the single context non-speculative execution phase of DSMT is used as a +hint to speculate the posterior behavior of multiple threads. DSMT employs +simple mechanisms based on state bits that keep track of inter-thread +dependencies in registers and memory, synchronize thread execution, and control +recovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for +choosing those sections of code which provide the highest performance based on +their past execution history. The DSMT architecture was simulated with a new +cycle-accurate, execution-driven simulator. Our simulation results show that +DSMT has very good potential to improve SMT performance, even when only a +single program is available. However, we found that dynamic thread behavior +together with frequent misspeculation may also produce diminishing returns in +performance. Therefore, the challenge is to maximize the amount of +thread-level parallelism that DSMT is capable of exploiting and at the same +time reduce the frequency of misspeculations. + +
+
+
+
+
+ + ☆ C3-VQA: Cryogenic Counter-based Co-processor for Variational Quantum + Algorithms + + +
+ Cryogenic quantum computers play a leading role in demonstrating quantum +advantage. Given the severe constraints on the cooling capacity in cryogenic +environments, thermal design is crucial for the scalability of these computers. +The sources of heat dissipation include passive inflow via inter-temperature +wires and the power consumption of components located in the cryostat, such as +wire amplifiers and quantum-classical interfaces. Thus, a critical challenge is +to reduce the number of wires by reducing the required inter-temperature +bandwidth while maintaining minimal additional power consumption in the +cryostat. One solution to address this challenge is near-data processing using +ultra-low-power computational logic within the cryostat. Based on the workload +analysis and domain-specific system design focused on Variational Quantum +Algorithms (VQAs), we propose the Cryogenic Counter-based Co-processor for VQAs +(C3-VQA) to enhance the design scalability of cryogenic quantum computers under +the thermal constraint. The C3-VQA utilizes single-flux-quantum logic, which is +an ultra-low-power superconducting digital circuit that operates at the 4 K +environment. The C3-VQA precomputes a part of the expectation value +calculations for VQAs and buffers intermediate values using simple bit +operation units and counters in the cryostat, thereby reducing the required +inter-temperature bandwidth with small additional power consumption. +Consequently, the C3-VQA reduces the number of wires, leading to a reduction in +the total heat dissipation in the cryostat. Our evaluation shows that the +C3-VQA reduces the total heat dissipation at the 4 K stage by 30% and 81% under +sequential-shot and parallel-shot execution scenarios, respectively. +Furthermore, a case study in quantum chemistry shows that the C3-VQA reduces +total heat dissipation by 87% with a 10,000-qubit system. + +
+
+ comment: 15 pages, 9 figures, 5 tables. This is an extention of + arXiv:2403.00363 and arXiv:2310.01630 +
+
+
+
+
+ + ☆ Efficient and Reliable Vector Similarity Search Using Asymmetric + Encoding with NAND-Flash for Many-Class Few-Shot Learning + + +
+ While memory-augmented neural networks (MANNs) offer an effective solution +for few-shot learning (FSL) by integrating deep neural networks with external +memory, the capacity requirements and energy overhead of data movement become +enormous due to the large number of support vectors in many-class FSL +scenarios. Various in-memory search solutions have emerged to improve the +energy efficiency of MANNs. NAND-based multi-bit content addressable memory +(MCAM) is a promising option due to its high density and large capacity. +Despite its potential, MCAM faces limitations such as a restricted number of +word lines, limited quantization levels, and non-ideal effects like varying +string currents and bottleneck effects, which lead to significant accuracy +drops. To address these issues, we propose several innovative methods. First, +the Multi-bit Thermometer Code (MTMC) leverages the extensive capacity of MCAM +to enhance vector precision using cumulative encoding rules, thereby mitigating +the bottleneck effect. Second, the Asymmetric vector similarity search (AVSS) +reduces the precision of the query vector while maintaining that of the support +vectors, thereby minimizing the search iterations and improving efficiency in +many-class scenarios. Finally, the Hardware-Aware Training (HAT) method +optimizes controller training by modeling the hardware characteristics of MCAM, +thus enhancing the reliability of the system. Our integrated framework reduces +search iterations by up to 32 times, and increases overall accuracy by 1.58% to +6.94%. + +
+
+
+
+
+ + ☆ LlamaF: An Efficient Llama2 Architecture Accelerator on Embedded FPGAs + + +
+ Large language models (LLMs) have demonstrated remarkable abilities in +natural language processing. However, their deployment on resource-constrained +embedded devices remains difficult due to memory and computational demands. In +this paper, we present an FPGA-based accelerator designed to improve LLM +inference performance on embedded FPGAs. We employ post-training quantization +to reduce model size and optimize for off-chip memory bandwidth. Our design +features asynchronous computation and a fully pipelined accelerator for +matrix-vector multiplication. Experiments of the TinyLlama 1.1B model on a +Xilinx ZCU102 platform show a 14.3-15.8x speedup and a 6.1x power efficiency +improvement over running exclusively on ZCU102 processing system (PS). + +
+
+
+
+
+
+
+
+ + Distributed, Parallel, and Cluster Computing 9 + +
+
+
+ + ☆ A Study on Asynchronous Vote-based Blockchains + + +
+ Vote-based blockchains construct a state machine replication (SMR) system +among participating nodes, using Byzantine Fault Tolerance (BFT) consensus +protocols to transition from one state to another. Currently, they rely on +either synchronous or partially synchronous networks with leader-based +coordination or costly Asynchronous Common Subset (ACS) protocols in +asynchronous settings, making them impractical for large-scale asynchronous +applications. + To make Asynchronous SMR scalable, this paper proposes a \emph{validated +strong} BFT consensus model that allows leader-based coordination in +asynchronous settings. Our BFT consensus model offers the same level of +tolerance as binary byzantine agreement but does not demand consistency among +honest nodes before they vote. An SMR using our model allows nodes to operate +in different, tentative, but mutually exclusive states until they eventually +converge on the same state. We propose an asynchronous BFT protocol for +vote-based blockchains employing our consensus model to address several +critical challenges: how to ensure that nodes eventually converge on the same +state across voting rounds, how to assure that a blockchain will steadily +progress through epochs while reaching consensus for previous epochs, and how +to maintain robust byzantine fault tolerance. + Our protocol greatly reduces message complexity and is the first one to +achieve linear view changes without relying on threshold signatures. We prove +that an asynchronous blockchain built on our protocol can operate with the +\emph{same} simplicity and efficiency as partially synchronous blockchains +built on, e.g. HotStuff-2. This facilitates deploying asynchronous blockchains +across large-scale networks. + +
+
+
+
+
+ + ☆ Microarchitectural comparison and in-core modeling of state-of-the-art + CPUs: Grace, Sapphire Rapids, and Genoa + + +
+ With Nvidia's release of the Grace Superchip, all three big semiconductor +companies in HPC (AMD, Intel, Nvidia) are currently competing in the race for +the best CPU. In this work we analyze the performance of these state-of-the-art +CPUs and create an accurate in-core performance model for their +microarchitectures Zen 4, Golden Cove, and Neoverse V2, extending the Open +Source Architecture Code Analyzer (OSACA) tool and comparing it with LLVM-MCA. +Starting from the peculiarities and up- and downsides of a single core, we +extend our comparison by a variety of microbenchmarks and the capabilities of a +full node. The "write-allocate (WA) evasion" feature, which can automatically +reduce the memory traffic caused by write misses, receives special attention; +we show that the Grace Superchip has a next-to-optimal implementation of WA +evasion, and that the only way to avoid write allocates on Zen 4 is the +explicit use of non-temporal stores. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Dynamic Simultaneous Multithreaded Architecture + + +
+ This paper presents the Dynamic Simultaneous Multi-threaded Architecture +(DSMT). DSMT efficiently executes multiple threads from a single program on a +SMT processor core. To accomplish this, threads are generated dynamically from +a predictable flow of control and then executed speculatively. Data obtained +during the single context non-speculative execution phase of DSMT is used as a +hint to speculate the posterior behavior of multiple threads. DSMT employs +simple mechanisms based on state bits that keep track of inter-thread +dependencies in registers and memory, synchronize thread execution, and control +recovery from misspeculation. Moreover, DSMT utilizes a novel greedy policy for +choosing those sections of code which provide the highest performance based on +their past execution history. The DSMT architecture was simulated with a new +cycle-accurate, execution-driven simulator. Our simulation results show that +DSMT has very good potential to improve SMT performance, even when only a +single program is available. However, we found that dynamic thread behavior +together with frequent misspeculation may also produce diminishing returns in +performance. Therefore, the challenge is to maximize the amount of +thread-level parallelism that DSMT is capable of exploiting and at the same +time reduce the frequency of misspeculations. + +
+
+
+
+
+ + ☆ DFDG: Data-Free Dual-Generator Adversarial Distillation for One-Shot + Federated Learning + + +
+ Federated Learning (FL) is a distributed machine learning scheme in which +clients jointly participate in the collaborative training of a global model by +sharing model information rather than their private datasets. In light of +concerns associated with communication and privacy, one-shot FL with a single +communication round has emerged as a de facto promising solution. However, +existing one-shot FL methods either require public datasets, focus on model +homogeneous settings, or distill limited knowledge from local models, making it +difficult or even impractical to train a robust global model. To address these +limitations, we propose a new data-free dual-generator adversarial distillation +method (namely DFDG) for one-shot FL, which can explore a broader local models' +training space via training dual generators. DFDG is executed in an adversarial +manner and comprises two parts: dual-generator training and dual-model +distillation. In dual-generator training, we delve into each generator +concerning fidelity, transferability and diversity to ensure its utility, and +additionally tailor the cross-divergence loss to lessen the overlap of dual +generators' output spaces. In dual-model distillation, the trained dual +generators work together to provide the training data for updates of the global +model. At last, our extensive experiments on various image classification tasks +show that DFDG achieves significant performance gains in accuracy compared to +SOTA baselines. + +
+
+ comment: Accepted by ICDM2024 main conference (long paper) +
+
+
+
+
+ + ☆ Cooperative Inference with Interleaved Operator Partitioning for CNNs + + +
+ Deploying deep learning models on Internet of Things (IoT) devices often
+faces challenges due to limited memory resources and computing capabilities.
+Cooperative inference is an important method for addressing this issue,
+requiring the partitioning and distributive deployment of an intelligent model.
+To perform horizontal partitions, existing cooperative inference methods take
+either the output channel of operators or the height and width of feature maps
+as the partition dimensions. In this manner, since the activation of operators
+is distributed, they have to be concatenated together before being fed to the
+next operator, which incurs the delay for cooperative inference. In this paper,
+we propose the Interleaved Operator Partitioning (IOP) strategy for CNN models.
+By partitioning an operator based on the output channel dimension and its
+successive operator based on the input channel dimension, activation
+concatenation becomes unnecessary, thereby reducing the number of communication
+connections, which consequently reduces cooperative inference delay. Based on
+IOP, we further present a model segmentation algorithm for minimizing
+cooperative inference time, which greedily selects operators for IOP pairing
+based on the inference delay benefit harvested. Experimental results
+demonstrate that compared with the state-of-the-art partition approaches used
+in CoEdge, the IOP strategy achieves 6.39% ~ 16.83% faster acceleration and
+reduces peak memory footprint by 21.22% ~ 49.98% for three classical image
+classification models.
+
+
+
+
+
+
+ + ☆ Self-Supervised Inference of Agents in Trustless Environments + + +
+ In this paper, we propose a novel approach where agents can form swarms to +produce high-quality responses effectively. This is accomplished by utilizing +agents capable of data inference and ranking, which can be effectively +implemented using LLMs as response classifiers. We assess existing approaches +for trustless agent inference, define our methodology, estimate practical +parameters, and model various types of malicious agent attacks. Our method +leverages the collective intelligence of swarms, ensuring robust and efficient +decentralized AI inference with better accuracy, security, and reliability. We +show that our approach is an order of magnitude faster than other trustless +inference strategies reaching less than 125 ms validation latency. + +
+
+
+
+
+ + ☆ E-QUARTIC: Energy Efficient Edge Ensemble of Convolutional Neural + Networks for Resource-Optimized Learning SP + + +
+ Ensemble learning is a meta-learning approach that combines the predictions +of multiple learners, demonstrating improved accuracy and robustness. +Nevertheless, ensembling models like Convolutional Neural Networks (CNNs) +result in high memory and computing overhead, preventing their deployment in +embedded systems. These devices are usually equipped with small batteries that +provide power supply and might include energy-harvesting modules that extract +energy from the environment. In this work, we propose E-QUARTIC, a novel Energy +Efficient Edge Ensembling framework to build ensembles of CNNs targeting +Artificial Intelligence (AI)-based embedded systems. Our design outperforms +single-instance CNN baselines and state-of-the-art edge AI solutions, improving +accuracy and adapting to varying energy conditions while maintaining similar +memory requirements. Then, we leverage the multi-CNN structure of the designed +ensemble to implement an energy-aware model selection policy in +energy-harvesting AI systems. We show that our solution outperforms the +state-of-the-art by reducing system failure rate by up to 40% while ensuring +higher average output qualities. Ultimately, we show that the proposed design +enables concurrent on-device training and high-quality inference execution at +the edge, limiting the performance and energy overheads to less than 0.04%. + +
+
+ comment: Accepted by the 30th Asia and South Pacific Design Automation + Conference (ASP-DAC 2025) +
+
+
+
+
+ + ☆ DiReDi: Distillation and Reverse Distillation for AIoT Applications + + +
+ Typically, the significant efficiency can be achieved by deploying different +edge AI models in various real world scenarios while a few large models manage +those edge AI models remotely from cloud servers. However, customizing edge AI +models for each user's specific application or extending current models to new +application scenarios remains a challenge. Inappropriate local training or fine +tuning of edge AI models by users can lead to model malfunction, potentially +resulting in legal issues for the manufacturer. To address aforementioned +issues, this paper proposes an innovative framework called "DiReD", which +involves knowledge DIstillation & REverse DIstillation. In the initial step, an +edge AI model is trained with presumed data and a KD process using the cloud AI +model in the upper management cloud server. This edge AI model is then +dispatched to edge AI devices solely for inference in the user's application +scenario. When the user needs to update the edge AI model to better fit the +actual scenario, the reverse distillation (RD) process is employed to extract +the knowledge: the difference between user preferences and the manufacturer's +presumptions from the edge AI model using the user's exclusive data. Only the +extracted knowledge is reported back to the upper management cloud server to +update the cloud AI model, thus protecting user privacy by not using any +exclusive data. The updated cloud AI can then update the edge AI model with the +extended knowledge. Simulation results demonstrate that the proposed "DiReDi" +framework allows the manufacturer to update the user model by learning new +knowledge from the user's actual scenario with private data. The initial +redundant knowledge is reduced since the retraining emphasizes user private +data. + +
+
+
+
+
+ + ♻ ☆ Noiseless Privacy-Preserving Decentralized Learning + + +
+ Decentralized learning (DL) enables collaborative learning without a server +and without training data leaving the users' devices. However, the models +shared in DL can still be used to infer training data. Conventional defenses +such as differential privacy and secure aggregation fall short in effectively +safeguarding user privacy in DL, either sacrificing model utility or +efficiency. We introduce Shatter, a novel DL approach in which nodes create +virtual nodes (VNs) to disseminate chunks of their full model on their behalf. +This enhances privacy by (i) preventing attackers from collecting full models +from other nodes, and (ii) hiding the identity of the original node that +produced a given model chunk. We theoretically prove the convergence of Shatter +and provide a formal analysis demonstrating how Shatter reduces the efficacy of +attacks compared to when exchanging full models between nodes. We evaluate the +convergence and attack resilience of Shatter with existing DL algorithms, with +heterogeneous datasets, and against three standard privacy attacks. Our +evaluation shows that Shatter not only renders these privacy attacks infeasible +when each node operates 16 VNs but also exhibits a positive impact on model +utility compared to standard DL. In summary, Shatter enhances the privacy of DL +while maintaining the utility and efficiency of the model. + +
+
+ comment: Accepted at PETS 2025 +
+
+
+
+
+
+
+
+ + Programming and Languages 4 + +
+
+
+ + ☆ Repr Types: One Abstraction to Rule Them All + + +
+ The choice of how to represent an abstract type can have a major impact on +the performance of a program, yet mainstream compilers cannot perform +optimizations at such a high level. When dealing with optimizations of data +type representations, an important feature is having extensible +representation-flexible data types; the ability for a programmer to add new +abstract types and operations, as well as concrete implementations of these, +without modifying the compiler or a previously defined library. Many research +projects support high-level optimizations through static analysis, +instrumentation, or benchmarking, but they are all restricted in at least one +aspect of extensibility. + This paper presents a new approach to representation-flexible data types +without such restrictions and which still finds efficient optimizations. Our +approach centers around a single built-in type $\texttt{repr}$ and function +overloading with cost annotations for operation implementations. We evaluate +our approach (i) by defining a universal collection type as a library, a single +type for all conventional collections, and (ii) by designing and implementing a +representation-flexible graph library. Programs using $\texttt{repr}$ types are +typically faster than programs with idiomatic representation choices -- +sometimes dramatically so -- as long as the compiler finds good implementations +for all operations. Our compiler performs the analysis efficiently by finding +optimized solutions quickly and by reusing previous results to avoid +recomputations. + +
+
+ comment: 25 pages, 11 figures +
+
+
+
+
+ + ☆ Weaver: A Retargetable Compiler Framework for FPQA Quantum Architectures + + +
+ While the prominent quantum computing architectures are based on +superconducting technology, new quantum hardware technologies are emerging, +such as Trapped Ions, Neutral Atoms (or FPQAs), Silicon Spin Qubits, etc. This +diverse set of technologies presents fundamental trade-offs in terms of +scalability, performance, manufacturing, and operating expenses. To manage +these diverse quantum technologies, there is a growing need for a retargetable +compiler that can efficiently adapt existing code to these emerging hardware +platforms. Such a retargetable compiler must be extensible to support new and +rapidly evolving technologies, performant with fast compilation times and +high-fidelity execution, and verifiable through rigorous equivalence checking +to ensure the functional equivalence of the retargeted code. + To this end, we present $Weaver$, the first extensible, performant, and +verifiable retargetable quantum compiler framework with a focus on FPQAs due to +their unique, promising features. $Weaver$ introduces WQASM, the first formal +extension of the standard OpenQASM quantum assembly with FPQA-specific +instructions to support their distinct capabilities. Next, $Weaver$ implements +the WOptimizer, an extensible set of FPQA-specific optimization passes to +improve execution quality. Last, the WChecker automatically checks for +equivalence between the original and the retargeted code. Our evaluation shows +that $Weaver$ improves compilation times by $10^3\times$, execution times by +$4.4\times$, and execution fidelity by $10\%$, on average, compared to +superconducting and state-of-the-art (non-retargetable) FPQA compilers. + +
+
+ comment: 11 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ AbstractBeam: Enhancing Bottom-Up Program Synthesis using Library + Learning + + +
+ LambdaBeam is a state-of-the-art, execution-guided algorithm for program +synthesis that utilizes higher-order functions, lambda functions, and iterative +loops within a Domain-Specific Language (DSL). LambdaBeam generates each +program from scratch but does not take advantage of the frequent recurrence of +program blocks or subprograms commonly found in specific domains, such as loops +for list traversal. To address this limitation, we introduce AbstractBeam: a +novel program synthesis framework designed to enhance LambdaBeam by leveraging +Library Learning. AbstractBeam identifies and integrates recurring program +structures into the DSL, optimizing the synthesis process. Our experimental +evaluations demonstrate that AbstractBeam statistically significantly (p < +0.05) outperforms LambdaBeam in the integer list manipulation domain. Beyond +solving more tasks, AbstractBeam's program synthesis is also more efficient, +requiring less time and fewer candidate programs to generate a solution. +Furthermore, our findings indicate that Library Learning effectively enhances +program synthesis in domains that are not explicitly designed to showcase its +advantages, thereby highlighting the broader applicability of Library Learning. + +
+
+
+
+
+ + ♻ ☆ QEDCartographer: Automating Formal Verification Using Reward-Free + Reinforcement Learning ICSE + + +
+ Formal verification is a promising method for producing reliable software, +but the difficulty of manually writing verification proofs severely limits its +utility in practice. Recent methods have automated some proof synthesis by +guiding a search through the proof space using a theorem prover. Unfortunately, +the theorem prover provides only the crudest estimate of progress, resulting in +effectively undirected search. To address this problem, we create +QEDCartographer, an automated proof-synthesis tool that combines supervised and +reinforcement learning to more effectively explore the proof space. +QEDCartographer incorporates the proofs' branching structure, enabling +reward-free search and overcoming the sparse reward problem inherent to formal +verification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K +theorems from 124 open-source Coq projects. QEDCartographer fully automatically +proves 21.4% of the test-set theorems. Previous search-based proof-synthesis +tools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on +supervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively. +Diva, which combines 62 tools, proves 19.2%. Comparing to the most effective +prior tool, Proverbot9001, QEDCartographer produces 34% shorter proofs 29% +faster, on average over the theorems both tools prove. Together, +QEDCartographer and non-learning-based CoqHammer prove 30.3% of the theorems, +while CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement +learning is a fruitful research direction for improving proof-synthesis tools' +search mechanisms. + +
+
+ comment: Published in the International Conference on Software Engineering + (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan + Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal + Verification Using Reward-Free Reinforcement Learning, in Proceedings of the + 47th International Conference on Software Engineering (ICSE), 2025 +
+
+
+
+
+
+
+
+ + Operation Systems 1 + +
+
+
+ + ☆ Rethinking Programmed I/O for Fast Devices, Cheap Cores, and Coherent + Interconnects + + +
+ Conventional wisdom holds that an efficient interface between an OS running +on a CPU and a high-bandwidth I/O device should be based on Direct Memory +Access (DMA), descriptor rings, and interrupts: DMA offloads transfers from the +CPU, descriptor rings provide buffering and queuing, and interrupts facilitate +asynchronous interaction between cores and device with a lightweight +notification mechanism. In this paper we question this wisdom in the light of +modern hardware and workloads, particularly in cloud servers. We argue that the +assumptions that led to this model are obsolete, and in many use-cases use of +programmed I/O, where the CPU explicitly transfers data and control information +to and from a device via loads and stores, actually results in a more efficient +system. We quantitatively demonstrate these advantages using three use-cases: +fine-grained RPC-style invocation of functions on an accelerator, offloading of +operators in a streaming dataflow engine, and a network interface targeting for +serverless functions. Moreover, we show that while these advantages are +significant over a modern PCIe peripheral bus, a truly cache-coherent +interconnect offers significant additional efficiency gains. + +
+
+
+
+
+
+
+
+ + Computational Complexity 5 + +
+
+
+ + ☆ Communication Separations for Truthful Auctions: Breaking the Two-Player + Barrier + + +
+ We study the communication complexity of truthful combinatorial auctions, and +in particular the case where valuations are either subadditive or +single-minded, which we denote with $\mathsf{SubAdd}\cup\mathsf{SingleM}$. We +show that for three bidders with valuations in +$\mathsf{SubAdd}\cup\mathsf{SingleM}$, any deterministic truthful mechanism +that achieves at least a $0.366$-approximation requires $\exp(m)$ +communication. In contrast, a natural extension of [Fei09] yields a +non-truthful $\mathrm{poly}(m)$-communication protocol that achieves a +$\frac{1}{2}$-approximation, demonstrating a gap between the power of truthful +mechanisms and non-truthful protocols for this problem. + Our approach follows the taxation complexity framework laid out in [Dob16b], +but applies this framework in a setting not encompassed by the techniques used +in past work. In particular, the only successful prior application of this +framework uses a reduction to simultaneous protocols which only applies for two +bidders [AKSW20], whereas our three-player lower bounds are stronger than what +can possibly arise from a two-player construction (since a trivial truthful +auction guarantees a $\frac{1}{2}$-approximation for two players). + +
+
+
+
+
+ + ☆ Fermionic Gaussian Testing and Non-Gaussian Measures via Convolution + + +
+ We explore the properties of fermionic convolution defined by fermionic +Gaussian unitary. A key finding is the purity invariance of pure Gaussian +states under this convolution. Leveraging this property, we propose an +efficient protocol to test the fermionic Gaussianity of pure states by using 3 +copies of the input states. Furthermore, we introduce a new family of measures +called ``Non-Gaussian Entropy,'' designed to quantify the fermionic +non-Gaussianity of states. + +
+
+ comment: 7+24 pages +
+
+
+
+
+ + ☆ A SUBSET-SUM Characterisation of the A-Hierarchy + + +
+ The A-hierarchy is a parametric analogue of the polynomial hierarchy in the
+context of parameterised complexity theory. We give a new characterisation of
+the A-hierarchy in terms of a generalisation of the SUBSET-SUM problem.
+
+
+
+
+
+
+ + ☆ Undecidability and incompleteness in quantum information theory and + operator algebras + + +
+ We survey a number of incompleteness results in operator algebras stemming +from the recent undecidability result in quantum complexity theory known as +$\operatorname{MIP}^*=\operatorname{RE}$, the most prominent of which is the +G\"odelian refutation of the Connes Embedding Problem. We also discuss the very +recent use of $\operatorname{MIP}^*=\operatorname{RE}$ in refuting the +Aldous-Lyons conjecture in probability theory. + +
+
+ comment: 38 pages. To appear in a special issue of Monatshefte f\"ur + Mathematik celebrating the 100th anniversary of G\"odel's matriculation at + the University of Vienna +
+
+
+
+
+ + ♻ ☆ Pure-Circuit: Tight Inapproximability for PPAD + + +
+ The current state-of-the-art methods for showing inapproximability in PPAD +arise from the $\varepsilon$-Generalized-Circuit ($\varepsilon$-GCircuit) +problem. Rubinstein (2018) showed that there exists a small unknown constant +$\varepsilon$ for which $\varepsilon$-GCircuit is PPAD-hard, and subsequent +work has shown hardness results for other problems in PPAD by using +$\varepsilon$-GCircuit as an intermediate problem. + We introduce Pure-Circuit, a new intermediate problem for PPAD, which can be +thought of as $\varepsilon$-GCircuit pushed to the limit as $\varepsilon +\rightarrow 1$, and we show that the problem is PPAD-complete. We then prove +that $\varepsilon$-GCircuit is PPAD-hard for all $\varepsilon < 0.1$ by a +reduction from Pure-Circuit, and thus strengthen all prior work that has used +GCircuit as an intermediate problem from the existential-constant regime to the +large-constant regime. + We show that stronger inapproximability results can be derived by reducing +directly from Pure-Circuit. In particular, we prove tight inapproximability +results for computing approximate Nash equilibria and approximate +well-supported Nash equilibria in graphical games, for finding approximate +well-supported Nash equilibria in polymatrix games, and for finding approximate +equilibria in threshold games. + +
+
+ comment: This journal version combines the results of two prior conference + papers: "Pure-Circuit: Strong Inapproximability for PPAD" published in FOCS + 2022, and "Tight Inapproximability for Graphical Games" (arXiv:2209.15151) + published in AAAI 2023 +
+
+
+
+
+
+
+
+ + Formal Languages and Automata Theory 3 + +
+
+
+
+ ☆ Alternating hierarchy of subshifts defined by nondeterministic
+ plane-walking automata
+
+
+
+ Plane-walking automata were introduced by Salo & T\"orma to recognise
+languages of two-dimensional infinite words (subshifts), the counterpart of
+$4$-way finite automata for two-dimensional finite words. We extend the model
+to allow for nondeterminism and alternation of quantifiers. We prove that the
+recognised subshifts form a strict subclass of sofic subshifts, and that the
+classes corresponding to existential and universal nondeterminism are
+incomparable and both larger than the deterministic class. We define a
+hierarchy of subshifts recognised by plane-walking automata with alternating
+quantifiers, which we conjecture to be strict.
+
+
+
+ comment: 14 pages, submitted to STACS 2025 +
+
+
+
+
+ + ☆ $\mathbb{N}$-polyregular functions arise from well-quasi-orderings + + +
+ A fundamental construction in formal language theory is the Myhill-Nerode
+congruence on words, whose finiteness characterizes regular languages. This
+construction was generalized to functions from $\Sigma^*$ to $\mathbb{Z}$ by
+Colcombet, Dou\'eneau-Tabot, and Lopez to characterize the class of so-called
+$\mathbb{Z}$-polyregular functions. In this paper, we relax the notion of
+equivalence relation to quasi-ordering in order to study the class of
+$\mathbb{N}$-polyregular functions, that plays the role of
+$\mathbb{Z}$-polyregular functions among functions from $\Sigma^*$ to
+$\mathbb{N}$. The analogue of having a finite index is then being a
+well-quasi-ordering. This provides a canonical object to describe
+$\mathbb{N}$-polyregular functions, together with a powerful new
+characterization of this class.
+
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.02232 +
+
+
+
+
+ + ♻ ☆ Attack Tree Generation via Process Mining + + +
+ Attack Trees are a graphical model of security used to study threat +scenarios. While visually appealing and supported by solid theories and +effective tools, one of their main drawbacks remains the amount of effort +required by security experts to design them from scratch. This work aims to +remedy this by providing a method for the automatic generation of Attack Trees +from attack logs. The main original feature of our approach w.r.t existing ones +is the use of Process Mining algorithms to synthesize Attack Trees, which allow +users to customize the way a set of logs are summarized as an Attack Tree, for +example by discarding statistically irrelevant events. Our approach is +supported by a prototype that, apart from the derivation and translation of the +model, provides the user with an Attack Tree in the RisQFLan format, a tool +used for quantitative risk modeling and analysis with Attack Trees. We +illustrate our approach with the case study of attacks on a communication +protocol, produced by a state-of-the-art protocol analyzer. + +
+
+
+
+
+
+
+
+ + Logic in Computer Science 6 + +
+
+
+ + ☆ Reasoning Around Paradox with Grounded Deduction + + +
+ How can we reason around logical paradoxes without falling into them? This +paper introduces grounded deduction or GD, a Kripke-inspired approach to +first-order logic and arithmetic that is neither classical nor intuitionistic, +but nevertheless appears both pragmatically usable and intuitively justifiable. +GD permits the direct expression of unrestricted recursive definitions - +including paradoxical ones such as 'L := not L' - while adding dynamic typing +premises to certain inference rules so that such paradoxes do not lead to +inconsistency. This paper constitutes a preliminary development and +investigation of grounded deduction, to be extended with further elaboration +and deeper analysis of its intriguing properties. + +
+
+
+
+
+ + ☆ Duality theory in linear optimization and its extensions -- formally + verified + + +
+ Farkas established that a system of linear inequalities has a solution if and +only if we cannot obtain a contradiction by taking a linear combination of the +inequalities. We state and formally prove several Farkas-like theorems over +linearly ordered fields in Lean 4. Furthermore, we extend duality theory to the +case when some coefficients are allowed to take ``infinite values''. + +
+
+ comment: Code: https://github.com/madvorak/duality/tree/v2.0.0 +
+
+
+
+
+ + ☆ A SUBSET-SUM Characterisation of the A-Hierarchy + + +
+ The A-hierarchy is a parametric analogue of the polynomial hierarchy in the
+context of parameterised complexity theory. We give a new characterisation of
+the A-hierarchy in terms of a generalisation of the SUBSET-SUM problem.
+
+
+
+
+
+
+ + ☆ Handling expression evaluation under interference + + +
+ Hoare-style inference rules for program constructs permit the copying of +expressions and tests from program text into logical contexts. It is known that +this requires care even for sequential programs but further issues arise for +concurrent programs because of potential interference to the values of +variables. The "rely-guarantee" approach does tackle the issue of recording +acceptable interference and offers a way to provide safe inference rules. This +paper shows how the algebraic presentation of rely-guarantee ideas can clarify +and formalise the conditions for safely re-using expressions and tests from +program text in logical contexts for reasoning about programs. + +
+
+ comment: 17 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ A rewriting-logic-with-SMT-based formal analysis and parameter synthesis + framework for parametric time Petri nets + + +
+ This paper presents a concrete and a symbolic rewriting logic semantics for +parametric time Petri nets with inhibitor arcs (PITPNs), a flexible model of +timed systems where parameters are allowed in firing bounds. We prove that our +semantics is bisimilar to the "standard" semantics of PITPNs. This allows us to +use the rewriting logic tool Maude, combined with SMT solving, to provide sound +and complete formal analyses for PITPNs. We develop and implement a new general +folding approach for symbolic reachability, so that Maude-with-SMT reachability +analysis terminates whenever the parametric state-class graph of the PITPN is +finite. Our work opens up the possibility of using the many formal analysis +capabilities of Maude -- including full LTL model checking, analysis with +user-defined analysis strategies, and even statistical model checking -- for +such nets. We illustrate this by explaining how almost all formal analysis and +parameter synthesis methods supported by the state-of-the-art PITPN tool Romeo +can be performed using Maude with SMT. In addition, we also support analysis +and parameter synthesis from parametric initial markings, as well as full LTL +model checking and analysis with user-defined execution strategies. Experiments +show that our methods outperform Romeo in many cases. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.08929 +
+
+
+
+
+ + ♻ ☆ From Proof-theoretic Validity to Base-extension Semantics for + Intuitionistic Propositional Logic + + +
+ Proof-theoretic semantics (P-tS) is the approach to meaning in logic based on +'proof' (as opposed to 'truth'). There are two major approaches to P-tS: +proof-theoretic validity (P-tV) and base-extension semantics (B-eS). The former +is a semantics of arguments, and the latter is a semantics of logical +constants. This paper demonstrates that the B-eS for intuitionistic +propositional logic (IPL) encapsulates the declarative content of a version of +P-tV based on the elimination rules. This explicates how the B-eS for IPL +works, and shows the completeness of this version of P-tV. + +
+
+
+
+
+
+
+ + + + + + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`