diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..ed6f35db
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-09-13T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.09030v1","updated":"2024-09-13T17:55:58Z","published":"2024-09-13T17:55:58Z","title":"Agents in Software Engineering: Survey, Landscape, and Vision","summary":" In recent years, Large Language Models (LLMs) have achieved remarkable\nsuccess and have been widely used in various downstream tasks, especially in\nthe tasks of the software engineering (SE) field. We find that many studies\ncombining LLMs with SE have employed the concept of agents either explicitly or\nimplicitly. However, there is a lack of an in-depth survey to sort out the\ndevelopment context of existing works, analyze how existing works combine the\nLLM-based agent technologies to optimize various tasks, and clarify the\nframework of LLM-based agents in SE. In this paper, we conduct the first survey\nof the studies on combining LLM-based agents with SE and present a framework of\nLLM-based agents in SE which includes three key modules: perception, memory,\nand action. We also summarize the current challenges in combining the two\nfields and propose future opportunities in response to existing challenges. We\nmaintain a GitHub repository of the related papers at:\nhttps://github.com/DeepSoftwareAnalytics/Awesome-Agent4SE.\n","authors":["Yanxian Huang","Wanjun Zhong","Ensheng Shi","Min Yang","Jiachi Chen","Hui Li","Yuchi Ma","Qianxiang Wang","Zibin Zheng","Yanlin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.09030v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.09013v1","updated":"2024-09-13T17:41:12Z","published":"2024-09-13T17:41:12Z","title":"AI-LieDar: Examine the Trade-off Between Utility and Truthfulness in LLM\n Agents","summary":" To be safely and successfully deployed, LLMs must simultaneously satisfy\ntruthfulness and utility goals. Yet, often these two goals compete (e.g., an AI\nagent assisting a used car salesman selling a car with flaws), partly due to\nambiguous or misleading user instructions. We propose AI-LieDar, a framework to\nstudy how LLM-based agents navigate scenarios with utility-truthfulness\nconflicts in a multi-turn interactive setting. We design a set of realistic\nscenarios where language agents are instructed to achieve goals that are in\nconflict with being truthful during a multi-turn conversation with simulated\nhuman agents. To evaluate the truthfulness at large scale, we develop a\ntruthfulness detector inspired by psychological literature to assess the\nagents' responses. Our experiment demonstrates that all models are truthful\nless than 50% of the time, although truthfulness and goal achievement (utility)\nrates vary across models. We further test the steerability of LLMs towards\ntruthfulness, finding that models follow malicious instructions to deceive, and\neven truth-steered models can still lie. 
These findings reveal the complex\nnature of truthfulness in LLMs and underscore the importance of further\nresearch to ensure the safe and reliable deployment of LLMs and AI agents.\n","authors":["Zhe Su","Xuhui Zhou","Sanketh Rangreji","Anubha Kabra","Julia Mendelsohn","Faeze Brahman","Maarten Sap"],"pdf_url":"https://arxiv.org/pdf/2409.09013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09009v1","updated":"2024-09-13T17:38:03Z","published":"2024-09-13T17:38:03Z","title":"Optimizing Rare Word Accuracy in Direct Speech Translation with a\n Retrieval-and-Demonstration Approach","summary":" Direct speech translation (ST) models often struggle with rare words.\nIncorrect translation of these words can have severe consequences, impacting\ntranslation quality and user trust. While rare word translation is inherently\nchallenging for neural models due to sparse learning signals, real-world\nscenarios often allow access to translations of past recordings on similar\ntopics. To leverage these valuable resources, we propose a\nretrieval-and-demonstration approach to enhance rare word translation accuracy\nin direct ST models. First, we adapt existing ST models to incorporate\nretrieved examples for rare word translation, which allows the model to benefit\nfrom prepended examples, similar to in-context learning. We then develop a\ncross-modal (speech-to-speech, speech-to-text, text-to-text) retriever to\nlocate suitable examples. We demonstrate that standard ST models can be\neffectively adapted to leverage examples for rare word translation, improving\nrare word translation accuracy over the baseline by 17.6% with gold examples\nand 8.5% with retrieved examples. Moreover, our speech-to-speech retrieval\napproach outperforms other modalities and exhibits higher robustness to unseen\nspeakers. Our code is publicly available\n(https://github.com/SiqiLii/Retrieve-and-Demonstration-ST).\n","authors":["Siqi Li","Danni Liu","Jan Niehues"],"pdf_url":"https://arxiv.org/pdf/2409.09009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09001v1","updated":"2024-09-13T17:31:09Z","published":"2024-09-13T17:31:09Z","title":"E2MoCase: A Dataset for Emotional, Event and Moral Observations in News\n Articles on High-impact Legal Cases","summary":" The way media reports on legal cases can significantly shape public opinion,\noften embedding subtle biases that influence societal views on justice and\nmorality. Analyzing these biases requires a holistic approach that captures the\nemotional tone, moral framing, and specific events within the narratives. In\nthis work we introduce E2MoCase, a novel dataset designed to facilitate the\nintegrated analysis of emotions, moral values, and events within legal\nnarratives and media coverage. By leveraging advanced models for emotion\ndetection, moral value identification, and event extraction, E2MoCase offers a\nmulti-dimensional perspective on how legal cases are portrayed in news\narticles.\n","authors":["Candida M. Greco","Lorenzo Zangari","Davide Picca","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2409.09001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05161v2","updated":"2024-09-13T17:24:52Z","published":"2024-05-08T15:54:12Z","title":"Motion Capture Analysis of Verb and Adjective Types in Austrian Sign\n Language","summary":" Across a number of sign languages, temporal and spatial characteristics of\ndominant hand articulation are used to express semantic and grammatical\nfeatures. 
In this study of Austrian Sign Language (\\\"Osterreichische\nGeb\\\"ardensprache, or \\\"OGS), motion capture data of four Deaf signers is used\nto quantitatively characterize the kinematic parameters of sign production in\nverbs and adjectives. We investigate (1) the difference in production between\nverbs involving a natural endpoint (telic verbs; e.g. arrive) and verbs lacking\nan endpoint (atelic verbs; e.g. analyze), and (2) adjective signs in\nintensified vs. non-intensified (plain) forms. Motion capture data analysis\nusing linear-mixed effects models (LME) indicates that both the endpoint\nmarking in verbs, as well as marking of intensification in adjectives, are\nexpressed by movement modulation in \\\"OGS. While the semantic distinction\nbetween verb types (telic/atelic) is marked by higher peak velocity and shorter\nduration for telic signs compared to atelic ones, the grammatical distinction\n(intensification) in adjectives is expressed by longer duration for intensified\ncompared to non-intensified adjectives. The observed individual differences of\nsigners might be interpreted as personal signing style.\n","authors":["Julia Krebs","Evie Malaia","Ronnie B. Wilbur","Isabella Fessl","Hans-Peter Wiesinger","Hermann Schwameder","Dietmar Roehm"],"pdf_url":"https://arxiv.org/pdf/2405.05161v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.06468v3","updated":"2024-09-13T16:44:17Z","published":"2024-05-10T13:27:32Z","title":"Pseudo-Prompt Generating in Pre-trained Vision-Language Models for\n Multi-Label Medical Image Classification","summary":" The task of medical image recognition is notably complicated by the presence\nof varied and multiple pathological indications, presenting a unique challenge\nin multi-label classification with unseen labels. This complexity underlines\nthe need for computer-aided diagnosis methods employing multi-label zero-shot\nlearning. Recent advancements in pre-trained vision-language models (VLMs) have\nshowcased notable zero-shot classification abilities on medical images.\nHowever, these methods have limitations on leveraging extensive pre-trained\nknowledge from broader image datasets, and often depend on manual prompt\nconstruction by expert radiologists. By automating the process of prompt\ntuning, prompt learning techniques have emerged as an efficient way to adapt\nVLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in\nperforming class-specific prompts on unseen categories, limiting\ngeneralizability in fine-grained scenarios. To overcome these constraints, we\nintroduce a novel prompt generation approach inspirited by text generation in\nnatural language processing (NLP). Our method, named Pseudo-Prompt Generating\n(PsPG), capitalizes on the priori knowledge of multi-modal features. Featuring\na RNN-based decoder, PsPG autoregressively generates class-tailored embedding\nvectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label\nchest radiograph datasets affirm the superiority of our approach against\nleading medical vision-language and multi-label prompt learning methods. 
The\nsource code is available at https://github.com/fallingnight/PsPG\n","authors":["Yaoqin Ye","Junjie Zhang","Hongwei Shi"],"pdf_url":"https://arxiv.org/pdf/2405.06468v3.pdf","comment":"Accepted by PRCV 2024"},{"id":"http://arxiv.org/abs/2409.08963v1","updated":"2024-09-13T16:29:25Z","published":"2024-09-13T16:29:25Z","title":"Safeguarding Decentralized Social Media: LLM Agents for Automating\n Community Rule Compliance","summary":" Ensuring content compliance with community guidelines is crucial for\nmaintaining healthy online social environments. However, traditional\nhuman-based compliance checking struggles with scaling due to the increasing\nvolume of user-generated content and a limited number of moderators. Recent\nadvancements in Natural Language Understanding demonstrated by Large Language\nModels unlock new opportunities for automated content compliance verification.\nThis work evaluates six AI-agents built on Open-LLMs for automated rule\ncompliance checking in Decentralized Social Networks, a challenging environment\ndue to heterogeneous community scopes and rules. Analyzing over 50,000 posts\nfrom hundreds of Mastodon servers, we find that AI-agents effectively detect\nnon-compliant content, grasp linguistic subtleties, and adapt to diverse\ncommunity contexts. Most agents also show high inter-rater reliability and\nconsistency in score justification and suggestions for compliance. Human-based\nevaluation with domain experts confirmed the agents' reliability and\nusefulness, rendering them promising tools for semi-automated or\nhuman-in-the-loop content moderation systems.\n","authors":["Lucio La Cava","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2409.08963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08936v1","updated":"2024-09-13T15:55:15Z","published":"2024-09-13T15:55:15Z","title":"SynSUM -- Synthetic Benchmark with Structured and Unstructured Medical\n Records","summary":" We present the SynSUM benchmark, a synthetic dataset linking unstructured\nclinical notes to structured background variables. The dataset consists of\n10,000 artificial patient records containing tabular variables (like symptoms,\ndiagnoses and underlying conditions) and related notes describing the fictional\npatient encounter in the domain of respiratory diseases. The tabular portion of\nthe data is generated through a Bayesian network, where both the causal\nstructure between the variables and the conditional probabilities are proposed\nby an expert based on domain knowledge. We then prompt a large language model\n(GPT-4o) to generate a clinical note related to this patient encounter,\ndescribing the patient symptoms and additional context. The SynSUM dataset is\nprimarily designed to facilitate research on clinical information extraction in\nthe presence of tabular background variables, which can be linked through\ndomain knowledge to concepts of interest to be extracted from the text - the\nsymptoms, in the case of SynSUM. Secondary uses include research on the\nautomation of clinical reasoning over both tabular data and text, causal effect\nestimation in the presence of tabular and/or textual confounders, and\nmulti-modal synthetic data generation. 
The dataset can be downloaded from\nhttps://github.com/prabaey/SynSUM.\n","authors":["Paloma Rabaey","Henri Arno","Stefan Heytens","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2409.08936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08907v1","updated":"2024-09-13T15:20:18Z","published":"2024-09-13T15:20:18Z","title":"Affective Computing Has Changed: The Foundation Model Disruption","summary":" The dawn of Foundation Models has on the one hand revolutionised a wide range\nof research problems, and, on the other hand, democratised the access and use\nof AI-based tools by the general public. We even observe an incursion of these\nmodels into disciplines related to human psychology, such as the Affective\nComputing domain, suggesting their affective, emerging capabilities. In this\nwork, we aim to raise awareness of the power of Foundation Models in the field\nof Affective Computing by synthetically generating and analysing multimodal\naffective data, focusing on vision, linguistics, and speech (acoustics). We\nalso discuss some fundamental problems, such as ethical issues and regulatory\naspects, related to the use of Foundation Models in this research area.\n","authors":["Björn Schuller","Adria Mallol-Ragolta","Alejandro Peña Almansa","Iosif Tsangko","Mostafa M. Amin","Anastasia Semertzidou","Lukas Christ","Shahin Amiriparian"],"pdf_url":"https://arxiv.org/pdf/2409.08907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.04674v2","updated":"2024-09-13T15:00:45Z","published":"2021-08-10T13:25:29Z","title":"Natural Language Processing with Commonsense Knowledge: A Survey","summary":" Commonsense knowledge is essential for advancing natural language processing\n(NLP) by enabling models to engage in human-like reasoning, which requires a\ndeeper understanding of context and often involves making inferences based on\nimplicit external knowledge. This paper explores the integration of commonsense\nknowledge into various NLP tasks. We begin by reviewing prominent commonsense\nknowledge bases and then discuss the benchmarks used to evaluate the\ncommonsense reasoning capabilities of NLP models, particularly language models.\nFurthermore, we highlight key methodologies for incorporating commonsense\nknowledge and their applications across different NLP tasks. The paper also\nexamines the challenges and emerging trends in enhancing NLP systems with\ncommonsense reasoning. All literature referenced in this survey can be accessed\nvia our GitHub repository: https://github.com/yuboxie/awesome-commonsense.\n","authors":["Yubo Xie","Zonghui Liu","Zongyang Ma","Fanyuan Meng","Yan Xiao","Fahui Miao","Pearl Pu"],"pdf_url":"https://arxiv.org/pdf/2108.04674v2.pdf","comment":"20 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2404.01903v2","updated":"2024-09-13T14:56:46Z","published":"2024-04-02T12:44:44Z","title":"Understanding How CodeLLMs (Mis)Predict Types with Activation Steering","summary":" CodeLLMs are transforming software development as we know it. This is\nespecially true for tasks where rule-based approaches fall short, like type\nprediction. The type prediction task consists in adding a new type annotation\nto a partially typed program, such that the resulting program is closer to\nbeing fully typed. The intractability of rule-based approaches and high cost of\nmanual annotation make CodeLLMs an attractive solution to the problem. 
However,\nCodeLLMs are still far from being deployed on the large-scale due to doubts\nsurrounding their reliability.\n To shed some light on how CodeLLMs approach type prediction, we investigate\nwhat happens when a model mispredicts a type. We show that by applying\nsemantics-preserving edits to code, CodeLLMs are eventually misled into\nmispredicting type annotations. However, by leveraging activation steering we\nare able to \"steer\" the model back to the correct prediction, making models\nmore robust against semantically irrelevant prompt features. We show that\nsteering achieves comparable performance to fine-tuning directly on the type\nprediction task. Furthermore, we find that steering vectors computed from\nPython code are effective at correcting TypeScript mispredictions, and vice\nversa. To our knowledge, this is the first evidence of its kind to suggest that\nCodeLLMs learn task representations that transfer across languages.\n","authors":["Francesca Lucchetti","Arjun Guha"],"pdf_url":"https://arxiv.org/pdf/2404.01903v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.08887v1","updated":"2024-09-13T14:54:37Z","published":"2024-09-13T14:54:37Z","title":"Visual Language Tracking with Multi-modal Interaction: A Robust\n Benchmark","summary":" Visual Language Tracking (VLT) enhances tracking by mitigating the\nlimitations of relying solely on the visual modality, utilizing high-level\nsemantic information through language. This integration of the language enables\nmore advanced human-machine interaction. The essence of interaction is\ncognitive alignment, which typically requires multiple information exchanges,\nespecially in the sequential decision-making process of VLT. However, current\nVLT benchmarks do not account for multi-round interactions during tracking.\nThey provide only an initial text and bounding box (bbox) in the first frame,\nwith no further interaction as tracking progresses, deviating from the original\nmotivation of the VLT task. To address these limitations, we propose a novel\nand robust benchmark, VLT-MI (Visual Language Tracking with Multi-modal\nInteraction), which introduces multi-round interaction into the VLT task for\nthe first time. (1) We generate diverse, multi-granularity texts for\nmulti-round, multi-modal interaction based on existing mainstream VLT\nbenchmarks using DTLLM-VLT, leveraging the world knowledge of LLMs. (2) We\npropose a new VLT interaction paradigm that achieves multi-round interaction\nthrough text updates and object recovery. When multiple tracking failures\noccur, we provide the tracker with more aligned texts and corrected bboxes\nthrough interaction, thereby expanding the scope of VLT downstream tasks. (3)\nWe conduct comparative experiments on both traditional VLT benchmarks and\nVLT-MI, evaluating and analyzing the accuracy and robustness of trackers under\nthe interactive paradigm. This work offers new insights and paradigms for the\nVLT task, enabling a fine-grained evaluation of multi-modal trackers. 
We\nbelieve this approach can be extended to additional datasets in the future,\nsupporting broader evaluations and comparisons of video-language model\ncapabilities.\n","authors":["Xuchen Li","Shiyu Hu","Xiaokun Feng","Dailing Zhang","Meiqi Wu","Jing Zhang","Kaiqi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.08887v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.06650v2","updated":"2024-09-13T14:39:33Z","published":"2024-07-09T08:21:40Z","title":"An Automatic Quality Metric for Evaluating Simultaneous Interpretation","summary":" Simultaneous interpretation (SI), the translation of one language to another\nin real time, starts translation before the original speech has finished. Its\nevaluation needs to consider both latency and quality. This trade-off is\nchallenging especially for distant word order language pairs such as English\nand Japanese. To handle this word order gap, interpreters maintain the word\norder of the source language as much as possible to keep up with original\nlanguage to minimize its latency while maintaining its quality, whereas in\ntranslation reordering happens to keep fluency in the target language. This\nmeans outputs synchronized with the source language are desirable based on the\nreal SI situation, and it's a key for further progress in computational SI and\nsimultaneous machine translation (SiMT). In this work, we propose an automatic\nevaluation metric for SI and SiMT focusing on word order synchronization. Our\nevaluation metric is based on rank correlation coefficients, leveraging\ncross-lingual pre-trained language models. Our experimental results on\nNAIST-SIC-Aligned and JNPC showed our metrics' effectiveness to measure word\norder synchronization between source and target language.\n","authors":["Mana Makinae","Katsuhito Sudoh","Mararu Yamada","Satoshi Nakamura"],"pdf_url":"https://arxiv.org/pdf/2407.06650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08872v1","updated":"2024-09-13T14:35:47Z","published":"2024-09-13T14:35:47Z","title":"Exploring the Impact of Data Quantity on ASR in Extremely Low-resource\n Languages","summary":" This study investigates the efficacy of data augmentation techniques for\nlow-resource automatic speech recognition (ASR), focusing on two endangered\nAustronesian languages, Amis and Seediq. Recognizing the potential of\nself-supervised learning (SSL) in low-resource settings, we explore the impact\nof data volume on the continued pre-training of SSL models. We propose a novel\ndata-selection scheme leveraging a multilingual corpus to augment the limited\ntarget language data. This scheme utilizes a language classifier to extract\nutterance embeddings and employs one-class classifiers to identify utterances\nphonetically and phonologically proximate to the target languages. Utterances\nare ranked and selected based on their decision scores, ensuring the inclusion\nof highly relevant data in the SSL-ASR pipeline. Our experimental results\ndemonstrate the effectiveness of this approach, yielding substantial\nimprovements in ASR performance for both Amis and Seediq. 
These findings\nunderscore the feasibility and promise of data augmentation through\ncross-lingual transfer learning for low-resource language ASR.\n","authors":["Yao-Fei Cheng","Li-Wei Chen","Hung-Shin Lee","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14880v2","updated":"2024-09-13T14:06:19Z","published":"2022-11-27T16:31:33Z","title":"Combining Data Generation and Active Learning for Low-Resource Question\n Answering","summary":" Neural approaches have become very popular in Question Answering (QA),\nhowever, they require a large amount of annotated data. In this work, we\npropose a novel approach that combines data augmentation via question-answer\ngeneration with Active Learning to improve performance in low-resource\nsettings, where the target domains are diverse in terms of difficulty and\nsimilarity to the source domain. We also investigate Active Learning for\nquestion answering in different stages, overall reducing the annotation effort\nof humans. For this purpose, we consider target domains in realistic settings,\nwith an extremely low amount of annotated samples but with many unlabeled\ndocuments, which we assume can be obtained with little effort. Additionally, we\nassume a sufficient amount of labeled data from the source domain being\navailable. We perform extensive experiments to find the best setup for\nincorporating domain experts. Our findings show that our novel approach, where\nhumans are incorporated in a data generation approach, boosts performance in\nthe low-resource, domain-specific setting, allowing for low-labeling-effort\nquestion answering systems in new, specialized domains. They further\ndemonstrate how human annotation affects the performance of QA depending on the\nstage it is performed.\n","authors":["Maximilian Kimmich","Andrea Bartezzaghi","Jasmina Bogojeska","Cristiano Malossi","Ngoc Thang Vu"],"pdf_url":"https://arxiv.org/pdf/2211.14880v2.pdf","comment":"ICANN 2024"},{"id":"http://arxiv.org/abs/2409.08846v1","updated":"2024-09-13T14:04:39Z","published":"2024-09-13T14:04:39Z","title":"FP-VEC: Fingerprinting Large Language Models via Efficient Vector\n Addition","summary":" Training Large Language Models (LLMs) requires immense computational power\nand vast amounts of data. As a result, protecting the intellectual property of\nthese models through fingerprinting is essential for ownership authentication.\nWhile adding fingerprints to LLMs through fine-tuning has been attempted, it\nremains costly and unscalable. In this paper, we introduce FP-VEC, a pilot\nstudy on using fingerprint vectors as an efficient fingerprinting method for\nLLMs. Our approach generates a fingerprint vector that represents a\nconfidential signature embedded in the model, allowing the same fingerprint to\nbe seamlessly incorporated into an unlimited number of LLMs via vector\naddition. 
Results on several LLMs show that FP-VEC is lightweight by running on\nCPU-only devices for fingerprinting, scalable with a single training and\nunlimited fingerprinting process, and preserves the model's normal behavior.\nThe project page is available at https://fingerprintvector.github.io .\n","authors":["Zhenhua Xu","Wenpeng Xing","Zhebo Wang","Chang Hu","Chen Jie","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2409.08846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08845v1","updated":"2024-09-13T14:03:49Z","published":"2024-09-13T14:03:49Z","title":"AIPO: Improving Training Objective for Iterative Preference Optimization","summary":" Preference Optimization (PO), is gaining popularity as an alternative choice\nof Proximal Policy Optimization (PPO) for aligning Large Language Models\n(LLMs). Recent research on aligning LLMs iteratively with synthetic or\npartially synthetic data shows promising results in scaling up PO training for\nboth academic settings and proprietary trained models such as Llama3. Despite\nits success, our study shows that the length exploitation issue present in PO\nis even more severe in Iterative Preference Optimization (IPO) due to the\niterative nature of the process. In this work, we study iterative preference\noptimization with synthetic data. We share the findings and analysis along the\nway of building the iterative preference optimization pipeline. More\nspecifically, we discuss the length exploitation issue during iterative\npreference optimization and propose our training objective for iterative\npreference optimization, namely Agreement-aware Iterative Preference\nOptimization (AIPO). To demonstrate the effectiveness of our method, we conduct\ncomprehensive experiments and achieve state-of-the-art performance on MT-Bench,\nAlpacaEval 2.0, and Arena-Hard. Our implementation and model checkpoints will\nbe made available at https://github.com/bytedance/AIPO.\n","authors":["Yaojie Shen","Xinyao Wang","Yulei Niu","Ying Zhou","Lexin Tang","Libo Zhang","Fan Chen","Longyin Wen"],"pdf_url":"https://arxiv.org/pdf/2409.08845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11850v4","updated":"2024-09-13T13:53:58Z","published":"2023-01-27T16:56:24Z","title":"Predicting Sentence-Level Factuality of News and Bias of Media Outlets","summary":" Automated news credibility and fact-checking at scale require accurately\npredicting news factuality and media bias. This paper introduces a large\nsentence-level dataset, titled \"FactNews\", composed of 6,191 sentences expertly\nannotated according to factuality and media bias definitions proposed by\nAllSides. We use FactNews to assess the overall reliability of news sources, by\nformulating two text classification problems for predicting sentence-level\nfactuality of news reporting and bias of media outlets. Our experiments\ndemonstrate that biased sentences present a higher number of words compared to\nfactual sentences, besides having a predominance of emotions. Hence, the\nfine-grained analysis of subjectivity and impartiality of news articles\nprovided promising results for predicting the reliability of media outlets.\nFinally, due to the severity of fake news and political polarization in Brazil,\nand the lack of research for Portuguese, both dataset and baseline were\nproposed for Brazilian Portuguese.\n","authors":["Francielle Vargas","Kokil Jaidka","Thiago A. S. 
Pardo","Fabrício Benevenuto"],"pdf_url":"https://arxiv.org/pdf/2301.11850v4.pdf","comment":"Proceedings of the 14th International Conference on Recent Advances\n in Natural Language Processing (RANLP 2023).\n https://aclanthology.org/2023.ranlp-1.127"},{"id":"http://arxiv.org/abs/2409.08813v1","updated":"2024-09-13T13:24:52Z","published":"2024-09-13T13:24:52Z","title":"Your Weak LLM is Secretly a Strong Teacher for Alignment","summary":" The burgeoning capabilities of large language models (LLMs) have underscored\nthe need for alignment to ensure these models act in accordance with human\nvalues and intentions. Existing alignment frameworks present constraints either\nin the form of expensive human effort or high computational costs. This paper\nexplores a promising middle ground, where we employ a weak LLM that is\nsignificantly less resource-intensive than top-tier models, yet offers more\nautomation than purely human feedback. We present a systematic study to\nevaluate and understand weak LLM's ability to generate feedback for alignment.\nOur empirical findings demonstrate that weak LLMs can provide feedback that\nrivals or even exceeds that of fully human-annotated data. Our study indicates\na minimized impact of model size on feedback efficacy, shedding light on a\nscalable and sustainable alignment strategy. To deepen our understanding of\nalignment under weak LLM feedback, we conduct a series of qualitative and\nquantitative analyses, offering novel insights into the quality discrepancies\nbetween human feedback vs. weak LLM feedback.\n","authors":["Leitian Tao","Yixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.08813v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2409.08805v1","updated":"2024-09-13T13:13:39Z","published":"2024-09-13T13:13:39Z","title":"Exploring SSL Discrete Tokens for Multilingual ASR","summary":" With the advancement of Self-supervised Learning (SSL) in speech-related\ntasks, there has been growing interest in utilizing discrete tokens generated\nby SSL for automatic speech recognition (ASR), as they offer faster processing\ntechniques. However, previous studies primarily focused on multilingual ASR\nwith Fbank features or English ASR with discrete tokens, leaving a gap in\nadapting discrete tokens for multilingual ASR scenarios. This study presents a\ncomprehensive comparison of discrete tokens generated by various leading SSL\nmodels across multiple language domains. We aim to explore the performance and\nefficiency of speech discrete tokens across multiple language domains for both\nmonolingual and multilingual ASR scenarios. Experimental results demonstrate\nthat discrete tokens achieve comparable results against systems trained on\nFbank features in ASR tasks across seven language domains with an average word\nerror rate (WER) reduction of 0.31% and 1.76% absolute (2.80% and 15.70%\nrelative) on dev and test sets respectively, with particularly WER reduction of\n6.82% absolute (41.48% relative) on the Polish test set.\n","authors":["Mingyu Cui","Daxin Tan","Yifan Yang","Dingdong Wang","Huimeng Wang","Xiao Chen","Xie Chen","Xunying Liu"],"pdf_url":"https://arxiv.org/pdf/2409.08805v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.08797v1","updated":"2024-09-13T13:01:09Z","published":"2024-09-13T13:01:09Z","title":"Exploring SSL Discrete Speech Features for Zipformer-based Contextual\n ASR","summary":" Self-supervised learning (SSL) based discrete speech representations are\nhighly compact and domain adaptable. 
In this paper, SSL discrete speech\nfeatures extracted from WavLM models are used as additional cross-utterance\nacoustic context features in Zipformer-Transducer ASR systems. The efficacy of\nreplacing Fbank features with discrete token features for modelling either\ncross-utterance contexts (from preceding and future segments), or current\nutterance's internal contexts alone, or both at the same time, are demonstrated\nthoroughly on the Gigaspeech 1000-hr corpus. The best Zipformer-Transducer\nsystem using discrete tokens based cross-utterance context features outperforms\nthe baseline using utterance internal context only with statistically\nsignificant word error rate (WER) reductions of 0.32% to 0.41% absolute (2.78%\nto 3.54% relative) on the dev and test data. The lowest published WER of 11.15%\nand 11.14% were obtained on the dev and test sets. Our work is open-source and\npublicly available at\nhttps://github.com/open-creator/icefall/tree/master/egs/gigaspeech/Context\\_ASR.\n","authors":["Mingyu Cui","Yifan Yang","Jiajun Deng","Jiawen Kang","Shujie Hu","Tianzi Wang","Zhaoqing Li","Shiliang Zhang","Xie Chen","Xunying Liu"],"pdf_url":"https://arxiv.org/pdf/2409.08797v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.08792v1","updated":"2024-09-13T12:55:45Z","published":"2024-09-13T12:55:45Z","title":"Optimizing Ingredient Substitution Using Large Language Models to\n Enhance Phytochemical Content in Recipes","summary":" In the emerging field of computational gastronomy, aligning culinary\npractices with scientifically supported nutritional goals is increasingly\nimportant. This study explores how large language models (LLMs) can be applied\nto optimize ingredient substitutions in recipes, specifically to enhance the\nphytochemical content of meals. Phytochemicals are bioactive compounds found in\nplants, which, based on preclinical studies, may offer potential health\nbenefits. We fine-tuned models, including OpenAI's GPT-3.5, DaVinci, and Meta's\nTinyLlama, using an ingredient substitution dataset. These models were used to\npredict substitutions that enhance phytochemical content and create a\ncorresponding enriched recipe dataset. Our approach improved Hit@1 accuracy on\ningredient substitution tasks, from the baseline 34.53 plus-minus 0.10% to\n38.03 plus-minus 0.28% on the original GISMo dataset, and from 40.24 plus-minus\n0.36% to 54.46 plus-minus 0.29% on a refined version of the same dataset. These\nsubstitutions led to the creation of 1,951 phytochemically enriched ingredient\npairings and 1,639 unique recipes. While this approach demonstrates potential\nin optimizing ingredient substitutions, caution must be taken when drawing\nconclusions about health benefits, as the claims are based on preclinical\nevidence. Future work should include clinical validation and broader datasets\nto further evaluate the nutritional impact of these substitutions. 
This\nresearch represents a step forward in using AI to promote healthier eating\npractices, providing potential pathways for integrating computational methods\nwith nutritional science.\n","authors":["Luis Rita","Josh Southern","Ivan Laponogov","Kyle Higgins","Kirill Veselkov"],"pdf_url":"https://arxiv.org/pdf/2409.08792v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.08780v1","updated":"2024-09-13T12:36:52Z","published":"2024-09-13T12:36:52Z","title":"Sign Language Sense Disambiguation","summary":" This project explores methods to enhance sign language translation of German\nsign language, specifically focusing on disambiguation of homonyms. Sign\nlanguage is ambiguous and understudied which is the basis for our experiments.\nWe approach the improvement by training transformer-based models on various\nbodypart representations to shift the focus on said bodypart. To determine the\nimpact of, e.g., the hand or mouth representations, we experiment with\ndifferent combinations. The results show that focusing on the mouth increases\nthe performance in small dataset settings while shifting the focus on the hands\nretrieves better results in larger dataset settings. Our results contribute to\nbetter accessibility for non-hearing persons by improving the systems powering\ndigital assistants, enabling a more accurate interaction. The code for this\nproject can be found on GitHub.\n","authors":["Jana Grimm","Miriam Winkler","Oliver Kraus","Tanalp Agustoslu"],"pdf_url":"https://arxiv.org/pdf/2409.08780v1.pdf","comment":"LIMO2024 @ KONVENS 2024, 8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.09895v4","updated":"2024-09-13T12:28:45Z","published":"2024-08-19T11:09:12Z","title":"Performance Law of Large Language Models","summary":" Guided by the belief of the scaling law, large language models (LLMs) have\nachieved impressive performance in recent years. However, scaling law only\ngives a qualitative estimation of loss, which is influenced by various factors\nsuch as model architectures, data distributions, tokenizers, and computation\nprecision. Thus, estimating the real performance of LLMs with different\ntraining settings rather than loss may be quite useful in practical\ndevelopment. In this article, we present an empirical equation named\n\"Performance Law\" to directly predict the MMLU score of an LLM, which is a\nwidely used metric to indicate the general capability of LLMs in real-world\nconversations and applications. Based on only a few key hyperparameters of the\nLLM architecture and the size of training data, we obtain a quite accurate MMLU\nprediction of various LLMs with diverse sizes and architectures developed by\ndifferent organizations in different years. Performance law can be used to\nguide the choice of LLM architecture and the effective allocation of\ncomputational resources without extensive experiments.\n","authors":["Chuhan Wu","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2408.09895v4.pdf","comment":"Personal opinions of the authors"},{"id":"http://arxiv.org/abs/2409.08761v1","updated":"2024-09-13T12:09:20Z","published":"2024-09-13T12:09:20Z","title":"Journalists, Emotions, and the Introduction of Generative AI Chatbots: A\n Large-Scale Analysis of Tweets Before and After the Launch of ChatGPT","summary":" As part of a broader look at the impact of generative AI, this study\ninvestigated the emotional responses of journalists to the release of ChatGPT\nat the time of its launch. By analyzing nearly 1 million Tweets from\njournalists at major U.S. 
news outlets, we tracked changes in emotional tone\nand sentiment before and after the introduction of ChatGPT in November 2022.\nUsing various computational and natural language processing techniques to\nmeasure emotional shifts in response to ChatGPT's release, we found an increase\nin positive emotion and a more favorable tone post-launch, suggesting initial\noptimism toward AI's potential. This research underscores the pivotal role of\njournalists as interpreters of technological innovation and disruption,\nhighlighting how their emotional reactions may shape public narratives around\nemerging technologies. The study contributes to understanding the intersection\nof journalism, emotion, and AI, offering insights into the broader societal\nimpact of generative AI tools.\n","authors":["Seth C. Lewis","David M. Markowitz","Jon Benedik Bunquin"],"pdf_url":"https://arxiv.org/pdf/2409.08761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08253v2","updated":"2024-09-13T11:22:28Z","published":"2024-09-12T17:50:05Z","title":"The Design of Informative Take-Over Requests for Semi-Autonomous\n Cyber-Physical Systems: Combining Spoken Language and Visual Icons in a\n Drone-Controller Setting","summary":" The question of how cyber-physical systems should interact with human\npartners that can take over control or exert oversight is becoming more\npressing, as these systems are deployed for an ever larger range of tasks.\nDrawing on the literatures on handing over control during semi-autonomous\ndriving and human-robot interaction, we propose a design of a take-over request\nthat combines an abstract pre-alert with an informative TOR: Relevant sensor\ninformation is highlighted on the controller's display, while a spoken message\nverbalizes the reason for the TOR. We conduct our study in the context of a\nsemi-autonomous drone control scenario as our testbed. The goal of our online\nstudy is to assess in more detail what form a language-based TOR should take.\nSpecifically, we compare a full sentence condition to shorter fragments, and\ntest whether the visual highlighting should be done synchronously or\nasynchronously with the speech. Participants showed a higher accuracy in\nchoosing the correct solution with our bi-modal TOR and felt that they were\nbetter able to recognize the critical situation. Using only fragments in the\nspoken message rather than full sentences did not lead to improved accuracy or\nfaster reactions. Also, synchronizing the visual highlighting with the spoken\nmessage did not result in better accuracy and response times were even\nincreased in this condition.\n","authors":["Ashwini Gundappa","Emilia Ellsiepen","Lukas Schmitz","Frederik Wiehr","Vera Demberg"],"pdf_url":"https://arxiv.org/pdf/2409.08253v2.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.08719v1","updated":"2024-09-13T11:10:16Z","published":"2024-09-13T11:10:16Z","title":"Distilling Monolingual and Crosslingual Word-in-Context Representations","summary":" In this study, we propose a method that distils representations of word\nmeaning in context from a pre-trained masked language model in both monolingual\nand crosslingual settings. Word representations are the basis for context-aware\nlexical semantics and unsupervised semantic textual similarity (STS)\nestimation. 
Different from existing approaches, our method does not require\nhuman-annotated corpora nor updates of the parameters of the pre-trained model.\nThe latter feature is appealing for practical scenarios where the off-the-shelf\npre-trained model is a common asset among different applications. Specifically,\nour method learns to combine the outputs of different hidden layers of the\npre-trained model using self-attention. Our auto-encoder based training only\nrequires an automatically generated corpus. To evaluate the performance of the\nproposed approach, we performed extensive experiments using various benchmark\ntasks. The results on the monolingual tasks confirmed that our representations\nexhibited a competitive performance compared to that of the previous study for\nthe context-aware lexical semantic tasks and outperformed it for STS\nestimation. The results of the crosslingual tasks revealed that the proposed\nmethod largely improved crosslingual word representations of multilingual\npre-trained models.\n","authors":["Yuki Arase","Tomoyuki Kajiwara"],"pdf_url":"https://arxiv.org/pdf/2409.08719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08712v1","updated":"2024-09-13T10:59:24Z","published":"2024-09-13T10:59:24Z","title":"Layerwise Change of Knowledge in Neural Networks","summary":" This paper aims to explain how a deep neural network (DNN) gradually extracts\nnew knowledge and forgets noisy features through layers in forward propagation.\nUp to now, although the definition of knowledge encoded by the DNN has not\nreached a consensus, Previous studies have derived a series of mathematical\nevidence to take interactions as symbolic primitive inference patterns encoded\nby a DNN. We extend the definition of interactions and, for the first time,\nextract interactions encoded by intermediate layers. We quantify and track the\nnewly emerged interactions and the forgotten interactions in each layer during\nthe forward propagation, which shed new light on the learning behavior of DNNs.\nThe layer-wise change of interactions also reveals the change of the\ngeneralization capacity and instability of feature representations of a DNN.\n","authors":["Xu Cheng","Lei Cheng","Zhaoran Peng","Yang Xu","Tian Han","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08706v1","updated":"2024-09-13T10:48:35Z","published":"2024-09-13T10:48:35Z","title":"L3Cube-IndicQuest: A Benchmark Questing Answering Dataset for Evaluating\n Knowledge of LLMs in Indic Context","summary":" Large Language Models (LLMs) have made significant progress in incorporating\nIndic languages within multilingual models. However, it is crucial to\nquantitatively assess whether these languages perform comparably to globally\ndominant ones, such as English. Currently, there is a lack of benchmark\ndatasets specifically designed to evaluate the regional knowledge of LLMs in\nvarious Indic languages. In this paper, we present the L3Cube-IndicQuest, a\ngold-standard question-answering benchmark dataset designed to evaluate how\nwell multilingual LLMs capture regional knowledge across various Indic\nlanguages. The dataset contains 200 question-answer pairs, each for English and\n19 Indic languages, covering five domains specific to the Indic region. We aim\nfor this dataset to serve as a benchmark, providing ground truth for evaluating\nthe performance of LLMs in understanding and representing knowledge relevant to\nthe Indian context. 
The IndicQuest can be used for both reference-based\nevaluation and LLM-as-a-judge evaluation. The dataset is shared publicly at\nhttps://github.com/l3cube-pune/indic-nlp .\n","authors":["Pritika Rohera","Chaitrali Ginimav","Akanksha Salunke","Gayatri Sawant","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.08706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08692v1","updated":"2024-09-13T10:22:08Z","published":"2024-09-13T10:22:08Z","title":"B4: Towards Optimal Assessment of Plausible Code Solutions with\n Plausible Tests","summary":" Selecting the best code solution from multiple generated ones is an essential\ntask in code generation, which can be achieved by using some reliable\nvalidators (e.g., developer-written test cases) for assistance. Since reliable\ntest cases are not always available and can be expensive to build in practice,\nresearchers propose to automatically generate test cases to assess code\nsolutions. However, when both code solutions and test cases are plausible and\nnot reliable, selecting the best solution becomes challenging. Although some\nheuristic strategies have been proposed to tackle this problem, they lack a\nstrong theoretical guarantee and it is still an open question whether an\noptimal selection strategy exists. Our work contributes in two ways. First, we\nshow that within a Bayesian framework, the optimal selection strategy can be\ndefined based on the posterior probability of the observed passing states\nbetween solutions and tests. The problem of identifying the best solution is\nthen framed as an integer programming problem. Second, we propose an efficient\napproach for approximating this optimal (yet uncomputable) strategy, where the\napproximation error is bounded by the correctness of prior knowledge. We then\nincorporate effective prior knowledge to tailor code generation tasks. Both\ntheoretical and empirical studies confirm that existing heuristics are limited\nin selecting the best solutions with plausible test cases. Our proposed\napproximated optimal strategy B4 significantly surpasses existing heuristics in\nselecting code solutions generated by large language models (LLMs) with\nLLM-generated tests, achieving a relative performance improvement by up to 50%\nover the strongest heuristic and 246% over the random selection in the most\nchallenging scenarios. Our code is publicly available at\nhttps://github.com/ZJU-CTAG/B4.\n","authors":["Mouxiang Chen","Zhongxin Liu","He Tao","Yusu Hong","David Lo","Xin Xia","Jianling Sun"],"pdf_url":"https://arxiv.org/pdf/2409.08692v1.pdf","comment":"accepted by ASE' 24 (full paper)"},{"id":"http://arxiv.org/abs/2409.08680v1","updated":"2024-09-13T09:48:11Z","published":"2024-09-13T09:48:11Z","title":"NEST-RQ: Next Token Prediction for Speech Self-Supervised Pre-Training","summary":" Speech self-supervised pre-training can effectively improve the performance\nof downstream tasks. However, previous self-supervised learning (SSL) methods\nfor speech, such as HuBERT and BEST-RQ, focus on utilizing non-causal encoders\nwith bidirectional context, and lack sufficient support for downstream\nstreaming models. To address this issue, we introduce the next token prediction\nbased speech pre-training method with random-projection quantizer (NEST-RQ).\nNEST-RQ employs causal encoders with only left context and uses next token\nprediction (NTP) as the training task. 
On the large-scale dataset, compared to\nBEST-RQ, the proposed NEST-RQ achieves comparable performance on non-streaming\nautomatic speech recognition (ASR) and better performance on streaming ASR. We\nalso conduct analytical experiments in terms of the future context size of\nstreaming ASR, the codebook quality of SSL and the model size of the encoder.\nIn summary, the paper demonstrates the feasibility of the NTP in speech SSL and\nprovides empirical evidence and insights for speech SSL research.\n","authors":["Minglun Han","Ye Bai","Chen Shen","Youjia Huang","Mingkun Huang","Zehua Lin","Linhao Dong","Lu Lu","Yuxuan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08680v1.pdf","comment":"5 pages, 2 figures, Work in progress"},{"id":"http://arxiv.org/abs/2404.04167v5","updated":"2024-09-13T09:47:29Z","published":"2024-04-05T15:20:02Z","title":"Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model","summary":" In this study, we introduce CT-LLM, a 2B large language model (LLM) that\nillustrates a pivotal shift towards prioritizing the Chinese language in\ndeveloping LLMs. Uniquely initiated from scratch, CT-LLM diverges from the\nconventional methodology by primarily incorporating Chinese textual data,\nutilizing an extensive corpus of 1,200 billion tokens, including 800 billion\nChinese tokens, 300 billion English tokens, and 100 billion code tokens. This\nstrategic composition facilitates the model's exceptional proficiency in\nunderstanding and processing Chinese, a capability further enhanced through\nalignment techniques. Demonstrating remarkable performance on the CHC-Bench,\nCT-LLM excels in Chinese language tasks, and showcases its adeptness in English\nthrough SFT. This research challenges the prevailing paradigm of training LLMs\npredominantly on English corpora and then adapting them to other languages,\nbroadening the horizons for LLM training methodologies. By open-sourcing the\nfull process of training a Chinese LLM, including a detailed data processing\nprocedure with the obtained Massive Appropriate Pretraining Chinese Corpus\n(MAP-CC), a well-chosen multidisciplinary Chinese Hard Case Benchmark\n(CHC-Bench), and the 2B-size Chinese Tiny LLM (CT-LLM), we aim to foster\nfurther exploration and innovation in both academia and industry, paving the\nway for more inclusive and versatile language models.\n","authors":["Xinrun Du","Zhouliang Yu","Songyang Gao","Ding Pan","Yuyang Cheng","Ziyang Ma","Ruibin Yuan","Xingwei Qu","Jiaheng Liu","Tianyu Zheng","Xinchen Luo","Guorui Zhou","Wenhu Chen","Ge Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.04167v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08664v1","updated":"2024-09-13T09:27:05Z","published":"2024-09-13T09:27:05Z","title":"Investigating Disentanglement in a Phoneme-level Speech Codec for\n Prosody Modeling","summary":" Most of the prevalent approaches in speech prosody modeling rely on learning\nglobal style representations in a continuous latent space which encode and\ntransfer the attributes of reference speech. However, recent work on neural\ncodecs which are based on Residual Vector Quantization (RVQ) already shows\ngreat potential offering distinct advantages. We investigate the prosody\nmodeling capabilities of the discrete space of such an RVQ-VAE model, modifying\nit to operate on the phoneme-level. We condition both the encoder and decoder\nof the model on linguistic representations and apply a global speaker embedding\nin order to factor out both phonetic and speaker information. 
We conduct an\nextensive set of investigations based on subjective experiments and objective\nmeasures to show that the phoneme-level discrete latent representations\nobtained this way achieves a high degree of disentanglement, capturing\nfine-grained prosodic information that is robust and transferable. The latent\nspace turns out to have interpretable structure with its principal components\ncorresponding to pitch and energy.\n","authors":["Sotirios Karapiperis","Nikolaos Ellinas","Alexandra Vioni","Junkwang Oh","Gunu Jho","Inchul Hwang","Spyros Raptis"],"pdf_url":"https://arxiv.org/pdf/2409.08664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05176v4","updated":"2024-09-13T08:58:47Z","published":"2023-06-08T13:17:06Z","title":"RRWKV: Capturing Long-range Dependencies in RWKV","summary":" Owing to the impressive dot-product attention, the Transformers have been the\ndominant architectures in various natural language processing (NLP) tasks.\nRecently, the Receptance Weighted Key Value (RWKV) architecture follows a\nnon-transformer architecture to eliminate the drawbacks of dot-product\nattention, where memory and computational complexity exhibits quadratic scaling\nwith sequence length. Although RWKV has exploited a linearly tensor-product\nattention mechanism and achieved parallelized computations by deploying the\ntime-sequential mode, it fails to capture long-range dependencies because of\nits limitation on looking back at previous information, compared with full\ninformation obtained by direct interactions in the standard transformer.\nTherefore, the paper devises the Retrospected Receptance Weighted Key Value\n(RRWKV) architecture via incorporating the retrospecting ability into the RWKV\nto effectively absorb information, which maintains memory and computational\nefficiency as well.\n","authors":["Leilei Wang"],"pdf_url":"https://arxiv.org/pdf/2306.05176v4.pdf","comment":"Upon further review, the authors have determined that the conclusions\n presented in the paper are no longer valid or contain errors. As a result, we\n have decided to withdraw the paper to avoid the spread of incorrect findings"},{"id":"http://arxiv.org/abs/2408.10718v2","updated":"2024-09-13T08:09:50Z","published":"2024-08-20T10:40:35Z","title":"CodeJudge-Eval: Can Large Language Models be Good Judges in Code\n Understanding?","summary":" Recent advancements in large language models (LLMs) have showcased impressive\ncode generation capabilities, primarily evaluated through language-to-code\nbenchmarks. However, these benchmarks may not fully capture a model's code\nunderstanding abilities. We introduce CodeJudge-Eval (CJ-Eval), a novel\nbenchmark designed to assess LLMs' code understanding abilities from the\nperspective of code judging rather than code generation. CJ-Eval challenges\nmodels to determine the correctness of provided code solutions, encompassing\nvarious error types and compilation issues. By leveraging a diverse set of\nproblems and a fine-grained judging system, CJ-Eval addresses the limitations\nof traditional benchmarks, including the potential memorization of solutions.\nEvaluation of 12 well-known LLMs on CJ-Eval reveals that even state-of-the-art\nmodels struggle, highlighting the benchmark's ability to probe deeper into\nmodels' code understanding abilities. 
Our codes and benchmark are available at\n\\url{https://github.com/CodeLLM-Research/CodeJudge-Eval}.\n","authors":["Yuwei Zhao","Ziyang Luo","Yuchen Tian","Hongzhan Lin","Weixiang Yan","Annan Li","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2408.10718v2.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2409.08597v1","updated":"2024-09-13T07:28:47Z","published":"2024-09-13T07:28:47Z","title":"LA-RAG:Enhancing LLM-based ASR Accuracy with Retrieval-Augmented\n Generation","summary":" Recent advancements in integrating speech information into large language\nmodels (LLMs) have significantly improved automatic speech recognition (ASR)\naccuracy. However, existing methods often constrained by the capabilities of\nthe speech encoders under varied acoustic conditions, such as accents. To\naddress this, we propose LA-RAG, a novel Retrieval-Augmented Generation (RAG)\nparadigm for LLM-based ASR. LA-RAG leverages fine-grained token-level speech\ndatastores and a speech-to-speech retrieval mechanism to enhance ASR accuracy\nvia LLM in-context learning (ICL) capabilities. Experiments on Mandarin and\nvarious Chinese dialect datasets demonstrate significant improvements in ASR\naccuracy compared to existing methods, validating the effectiveness of our\napproach, especially in handling accent variations.\n","authors":["Shaojun Li","Hengchao Shang","Daimeng Wei","Jiaxin Guo","Zongyao Li","Xianghui He","Min Zhang","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2409.08597v1.pdf","comment":"submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.08596v1","updated":"2024-09-13T07:28:28Z","published":"2024-09-13T07:28:28Z","title":"Large Language Model Can Transcribe Speech in Multi-Talker Scenarios\n with Versatile Instructions","summary":" Recent advancements in large language models (LLMs) have revolutionized\nvarious domains, bringing significant progress and new opportunities. Despite\nprogress in speech-related tasks, LLMs have not been sufficiently explored in\nmulti-talker scenarios. In this work, we present a pioneering effort to\ninvestigate the capability of LLMs in transcribing speech in multi-talker\nenvironments, following versatile instructions related to multi-talker\nautomatic speech recognition (ASR), target talker ASR, and ASR based on\nspecific talker attributes such as sex, occurrence order, language, and keyword\nspoken. Our approach utilizes WavLM and Whisper encoder to extract\nmulti-faceted speech representations that are sensitive to speaker\ncharacteristics and semantic context. These representations are then fed into\nan LLM fine-tuned using LoRA, enabling the capabilities for speech\ncomprehension and transcription. Comprehensive experiments reveal the promising\nperformance of our proposed system, MT-LLM, in cocktail party scenarios,\nhighlighting the potential of LLM to handle speech-related tasks based on user\ninstructions in such complex settings.\n","authors":["Lingwei Meng","Shujie Hu","Jiawen Kang","Zhaoqing Li","Yuejiao Wang","Wenxuan Wu","Xixin Wu","Xunying Liu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2409.08596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05693v2","updated":"2024-09-13T06:57:01Z","published":"2024-07-08T07:47:30Z","title":"Sub-SA: Strengthen In-context Learning via Submodular Selective\n Annotation","summary":" In-context learning (ICL) leverages in-context examples as prompts for the\npredictions of Large Language Models (LLMs). These prompts play a crucial role\nin achieving strong performance. 
However, the selection of suitable prompts\nfrom a large pool of labeled examples often entails significant annotation\ncosts. To address this challenge, we propose Sub-SA (Submodular Selective\nAnnotation), a submodule-based selective annotation method. The aim of Sub-SA\nis to reduce annotation costs while improving the quality of in-context\nexamples and minimizing the time consumption of the selection process. In\nSub-SA, we design a submodular function that facilitates effective subset\nselection for annotation and demonstrates the characteristics of monotonicity\nand submodularity from the theoretical perspective. Specifically, we propose\nRPR (Reward and Penalty Regularization) to better balance the diversity and\nrepresentativeness of the unlabeled dataset attributed to a reward term and a\npenalty term, respectively. Consequently, the selection for annotations can be\neffectively addressed with a simple yet effective greedy search algorithm based\non the submodular function. Finally, we apply the similarity prompt retrieval\nto get the examples for ICL.\n","authors":["Jian Qian","Miao Sun","Sifan Zhou","Ziyu Zhao","Ruizhi Hun","Patrick Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.05693v2.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2409.08564v1","updated":"2024-09-13T06:34:15Z","published":"2024-09-13T06:34:15Z","title":"Cracking the Code: Multi-domain LLM Evaluation on Real-World\n Professional Exams in Indonesia","summary":" While knowledge evaluation in large language models has predominantly focused\non academic subjects like math and physics, these assessments often fail to\ncapture the practical demands of real-world professions. In this paper, we\nintroduce IndoCareer, a dataset comprising 8,834 multiple-choice questions\ndesigned to evaluate performance in vocational and professional certification\nexams across various fields. With a focus on Indonesia, IndoCareer provides\nrich local contexts, spanning six key sectors: (1) healthcare, (2) insurance\nand finance, (3) creative and design, (4) tourism and hospitality, (5)\neducation and training, and (6) law. Our comprehensive evaluation of 27 large\nlanguage models shows that these models struggle particularly in fields with\nstrong local contexts, such as insurance and finance. Additionally, while using\nthe entire dataset, shuffling answer options generally maintains consistent\nevaluation results across models, but it introduces instability specifically in\nthe insurance and finance sectors.\n","authors":["Fajri Koto"],"pdf_url":"https://arxiv.org/pdf/2409.08564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08561v1","updated":"2024-09-13T06:29:20Z","published":"2024-09-13T06:29:20Z","title":"Expediting and Elevating Large Language Model Reasoning via Hidden\n Chain-of-Thought Decoding","summary":" Large language models (LLMs) have demonstrated remarkable capabilities in\ntasks requiring reasoning and multi-step problem-solving through the use of\nchain-of-thought (CoT) prompting. However, generating the full CoT process\nresults in significantly longer output sequences, leading to increased\ncomputational costs and latency during inference. To address this challenge, we\npropose a novel approach to compress the CoT process through semantic\nalignment, enabling more efficient decoding while preserving the benefits of\nCoT reasoning. 
Our method introduces an auxiliary CoT model that learns to\ngenerate and compress the full thought process into a compact special token\nrepresentation semantically aligned with the original CoT output. This\ncompressed representation is then integrated into the input of the Hidden\nChain-of-Thought (HCoT) model. The training process follows a two-stage\nprocedure: First, the CoT model is optimized to generate the compressed token\nrepresentations aligned with the ground-truth CoT outputs using a contrastive\nloss. Subsequently, with the CoT model parameters frozen, the HCoT model is\nfine-tuned to generate accurate subsequent predictions conditioned on the\nprefix instruction and the compressed CoT representations from the CoT model.\nExtensive experiments across three challenging domains - mathematical\nreasoning, agent invocation, and question answering - demonstrate that our\nsemantic compression approach achieves competitive or improved performance\ncompared to the full CoT baseline, while providing significant speedups of at\nleast 1.5x in decoding time. Moreover, incorporating contrastive learning\nobjectives further enhances the quality of the compressed representations,\nleading to better CoT prompting and improved task accuracy. Our work paves the\nway for more efficient exploitation of multi-step reasoning capabilities in\nLLMs across a wide range of applications.\n","authors":["Tianqiao Liu","Zui Chen","Zitao Liu","Mi Tian","Weiqi Luo"],"pdf_url":"https://arxiv.org/pdf/2409.08561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08554v1","updated":"2024-09-13T06:13:55Z","published":"2024-09-13T06:13:55Z","title":"LLM-Powered Grapheme-to-Phoneme Conversion: Benchmark and Case Study","summary":" Grapheme-to-phoneme (G2P) conversion is critical in speech processing,\nparticularly for applications like speech synthesis. G2P systems must possess\nlinguistic understanding and contextual awareness of languages with polyphone\nwords and context-dependent phonemes. Large language models (LLMs) have\nrecently demonstrated significant potential in various language tasks,\nsuggesting that their phonetic knowledge could be leveraged for G2P. In this\npaper, we evaluate the performance of LLMs in G2P conversion and introduce\nprompting and post-processing methods that enhance LLM outputs without\nadditional training or labeled data. We also present a benchmarking dataset\ndesigned to assess G2P performance on sentence-level phonetic challenges of the\nPersian language. Our results show that by applying the proposed methods, LLMs\ncan outperform traditional G2P tools, even in an underrepresented language like\nPersian, highlighting the potential of developing LLM-aided G2P systems.\n","authors":["Mahta Fetrat Qharabagh","Zahra Dehghanian","Hamid R. Rabiee"],"pdf_url":"https://arxiv.org/pdf/2409.08554v1.pdf","comment":"5 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.13745v3","updated":"2024-09-13T05:50:11Z","published":"2024-08-25T07:10:36Z","title":"DOCE: Finding the Sweet Spot for Execution-Based Code Generation","summary":" Recently, a diverse set of decoding and reranking procedures have been shown\neffective for LLM-based code generation. However, a comprehensive framework\nthat links and experimentally compares these methods is missing. We address\nthis by proposing Decoding Objectives for Code Execution, a comprehensive\nframework that includes candidate generation, $n$-best reranking, minimum Bayes\nrisk (MBR) decoding, and self-debugging as the core components. 
We then study\nthe contributions of these components through execution-based evaluation\nmetrics. Our findings highlight the importance of execution-based methods and\nthe difference gap between execution-based and execution-free methods.\nFurthermore, we assess the impact of filtering based on trial unit tests, a\nsimple and effective strategy that has been often overlooked in prior works. We\nalso propose self-debugging on multiple candidates, obtaining state-of-the-art\nperformance on reranking for code generation. We expect our framework to\nprovide a solid guideline for future research on code generation.\n","authors":["Haau-Sing Li","Patrick Fernandes","Iryna Gurevych","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2408.13745v3.pdf","comment":"10 pages (32 including appendix), 5 figures, 25 tables. Prompts are\n provided in the GitHub repository to avoid potential text overlap with other\n papers"},{"id":"http://arxiv.org/abs/2408.09632v3","updated":"2024-09-13T05:34:14Z","published":"2024-08-19T01:30:14Z","title":"MoDeGPT: Modular Decomposition for Large Language Model Compression","summary":" Large Language Models (LLMs) have reshaped the landscape of artificial\nintelligence by demonstrating exceptional performance across various tasks.\nHowever, substantial computational requirements make their deployment\nchallenging on devices with limited resources. Recently, compression methods\nusing low-rank matrix techniques have shown promise, yet these often lead to\ndegraded accuracy or introduce significant overhead in parameters and inference\nlatency. This paper introduces \\textbf{Mo}dular \\textbf{De}composition\n(MoDeGPT), a novel structured compression framework that does not need recovery\nfine-tuning while resolving the above drawbacks. MoDeGPT partitions the\nTransformer block into modules comprised of matrix pairs and reduces the hidden\ndimensions via reconstructing the module-level outputs. MoDeGPT is developed\nbased on a theoretical framework that utilizes three well-established matrix\ndecomposition algorithms -- Nystr\\\"om approximation, CR decomposition, and SVD\n-- and applies them to our redefined transformer modules. Our comprehensive\nexperiments show MoDeGPT, without backward propagation, matches or surpasses\nprevious structured compression methods that rely on gradient information, and\nsaves 98% of compute costs on compressing a 13B model. On \\textsc{Llama}-2/3\nand OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30%\ncompression rates. Moreover, the compression can be done on a single GPU within\na few hours and increases the inference throughput by up to 46%.\n","authors":["Chi-Heng Lin","Shangqian Gao","James Seale Smith","Abhishek Patel","Shikhar Tuli","Yilin Shen","Hongxia Jin","Yen-Chang Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09632v3.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.05074v4","updated":"2024-09-13T05:12:52Z","published":"2024-08-09T14:02:24Z","title":"RT-Surv: Improving Mortality Prediction After Radiotherapy with Large\n Language Model Structuring of Large-Scale Unstructured Electronic Health\n Records","summary":" Accurate patient selection is critical in radiotherapy (RT) to prevent\nineffective treatments. Traditional survival prediction models, relying on\nstructured data, often lack precision. 
This study explores the potential of\nlarge language models (LLMs) to structure unstructured electronic health record\n(EHR) data, thereby improving survival prediction accuracy through\ncomprehensive clinical information integration. Data from 34,276 patients\ntreated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed,\nencompassing both structured and unstructured data. An open-source LLM was used\nto structure the unstructured EHR data via single-shot learning, with its\nperformance compared against a domain-specific medical LLM and a smaller\nvariant. Survival prediction models were developed using statistical, machine\nlearning, and deep learning approaches, incorporating both structured and\nLLM-structured data. Clinical experts evaluated the accuracy of the\nLLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring\nunstructured EHR data without additional training, significantly outperforming\nthe domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs\nwere more effective, particularly in extracting clinically relevant features\nlike general condition and disease extent, which closely correlated with\npatient survival. Incorporating LLM-structured clinical features into survival\nprediction models significantly improved accuracy, with the C-index of deep\nlearning models increasing from 0.737 to 0.820. These models also became more\ninterpretable by emphasizing clinically significant factors. This study shows\nthat general-domain LLMs, even without specific medical training, can\neffectively structure large-scale unstructured EHR data, substantially\nenhancing the accuracy and interpretability of clinical predictive models.\n","authors":["Sangjoon Park","Chan Woo Wee","Seo Hee Choi","Kyung Hwan Kim","Jee Suk Chang","Hong In Yoon","Ik Jae Lee","Yong Bae Kim","Jaeho Cho","Ki Chang Keum","Chang Geol Lee","Hwa Kyung Byun","Woong Sub Koom"],"pdf_url":"https://arxiv.org/pdf/2408.05074v4.pdf","comment":"23 pages, 2 tables, 4 figures"},{"id":"http://arxiv.org/abs/2409.06639v3","updated":"2024-09-13T04:22:56Z","published":"2024-09-10T16:54:32Z","title":"TeXBLEU: Automatic Metric for Evaluate LaTeX Format","summary":" LaTeX is suitable for creating specially formatted documents in science,\ntechnology, mathematics, and computer science. Although the use of mathematical\nexpressions in LaTeX format along with language models is increasing, there are\nno proper evaluation metrics to evaluate them. In this study, we propose\nTeXBLEU, a metric for evaluating mathematical expressions in the LaTeX format\nbuilt on the n-gram-based BLEU metric widely used in translation tasks. The\nproposed TeXBLEU consists of a predefined tokenizer trained on the arXiv paper\ndataset and a fine-tuned embedding model with positional encoding. The TeXBLEU\nscore was calculated by replacing BLEU's modified precision score with the\nsimilarity of n-gram-based tokens. TeXBLEU showed improvements of 86\%, 121\%,\nand 610\% over traditional evaluation metrics, such as BLEU, sacreBLEU, and\nRouge, respectively, on the MathBridge dataset with 1,000 data points. 
The code\nis available at https://github.com/KyuDan1/TeXBLEU.\n","authors":["Kyudan Jung","Nam-Joon Kim","Hyongon Ryu","Sieun Hyeon","Seung-jun Lee","Hyeok-jae Lee"],"pdf_url":"https://arxiv.org/pdf/2409.06639v3.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.08523v1","updated":"2024-09-13T04:06:00Z","published":"2024-09-13T04:06:00Z","title":"Eir: Thai Medical Large Language Models","summary":" We present Eir Thai Medical LLM, a large language model with 8 billion\nparameters, specifically designed to enhance the accuracy of handling medical\ntasks in the Thai language. This model focuses on providing clear and\neasy-to-understand answers for both healthcare professionals and patients,\nthereby improving the efficiency of diagnosis and treatment processes. Human\nevaluation was conducted to ensure that the model adheres to care standards and\nprovides unbiased answers.\n To prioritize data security, the model is deployed within the hospital's\ninternal network, ensuring both high security and faster processing speeds. The\ninternal API connection is secured with encryption and strict authentication\nmeasures to prevent data leaks and unauthorized access.\n We evaluated several open-source large language models with 8 billion\nparameters on four medical benchmarks: MedQA, MedMCQA, PubMedQA, and the\nmedical subset of MMLU. The best-performing baselines were used to develop Eir\nThai Medical LLM. Our evaluation employed multiple questioning strategies,\nincluding zero-shot, few-shot, chain-of-thought reasoning, and\nensemble/self-consistency voting methods. Our model outperformed commercially\navailable Thai-language large language models by more than 10%. In addition, we\ndeveloped enhanced model testing tailored for clinical use in Thai across 18\nclinical tasks, where our model exceeded GPT-4o performance by more than 11%\n","authors":["Yutthakorn Thiprak","Rungtam Ngodngamthaweesuk","Songtam Ngodngamtaweesuk"],"pdf_url":"https://arxiv.org/pdf/2409.08523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07556v4","updated":"2024-09-13T03:59:49Z","published":"2024-03-12T11:40:44Z","title":"Truth-Aware Context Selection: Mitigating Hallucinations of Large\n Language Models Being Misled by Untruthful Contexts","summary":" Although Large Language Models (LLMs) have demonstrated impressive text\ngeneration capabilities, they are easily misled by untruthful contexts provided\nby users or knowledge augmentation tools, leading to hallucinations. To\nalleviate LLMs from being misled by untruthful context and take advantage of\nknowledge augmentation, we propose Truth-Aware Context Selection (TACS), a\nlightweight method to adaptively recognize and mask untruthful context from the\ninputs. TACS begins by performing truth detection on the input context,\nleveraging the parameterized knowledge within the LLM. Subsequently, it\nconstructs a corresponding attention mask based on the truthfulness of each\nposition, selecting the truthful context and discarding the untruthful context.\nAdditionally, we introduce a new evaluation metric, Disturbance Adaption Rate,\nto further study the LLMs' ability to accept truthful information and resist\nuntruthful information. 
Experimental results indicate that TACS can effectively\nfilter untruthful context and significantly improve the overall quality of\nLLMs' responses when presented with misleading information.\n","authors":["Tian Yu","Shaolei Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2403.07556v4.pdf","comment":"Accepted to ACL 2024 Findings. Code is available at:\n https://github.com/ictnlp/TACS"},{"id":"http://arxiv.org/abs/2409.08522v1","updated":"2024-09-13T03:45:10Z","published":"2024-09-13T03:45:10Z","title":"MAPX: An explainable model-agnostic framework for the detection of false\n information on social media networks","summary":" The automated detection of false information has become a fundamental task in\ncombating the spread of \"fake news\" on online social media networks (OSMN) as\nit reduces the need for manual discernment by individuals. In the literature,\nleveraging various content or context features of OSMN documents have been\nfound useful. However, most of the existing detection models often utilise\nthese features in isolation without regard to the temporal and dynamic changes\noft-seen in reality, thus, limiting the robustness of the models. Furthermore,\nthere has been little to no consideration of the impact of the quality of\ndocuments' features on the trustworthiness of the final prediction. In this\npaper, we introduce a novel model-agnostic framework, called MAPX, which allows\nevidence based aggregation of predictions from existing models in an\nexplainable manner. Indeed, the developed aggregation method is adaptive,\ndynamic and considers the quality of OSMN document features. Further, we\nperform extensive experiments on benchmarked fake news datasets to demonstrate\nthe effectiveness of MAPX using various real-world data quality scenarios. Our\nempirical results show that the proposed framework consistently outperforms all\nstate-of-the-art models evaluated. For reproducibility, a demo of MAPX is\navailable at \\href{https://github.com/SCondran/MAPX_framework}{this link}\n","authors":["Sarah Condran","Michael Bewong","Selasi Kwashie","Md Zahidul Islam","Irfan Altas","Joshua Condran"],"pdf_url":"https://arxiv.org/pdf/2409.08522v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.12665v2","updated":"2024-09-13T03:07:37Z","published":"2024-07-17T15:48:39Z","title":"Patch-Level Training for Large Language Models","summary":" As Large Language Models (LLMs) achieve remarkable progress in language\nunderstanding and generation, their training efficiency has become a critical\nconcern. Traditionally, LLMs are trained to predict the next token in a\nsequence. Despite the success of token-level training, it suffers from\nconsiderable computational costs due to the need to process an extensive number\nof tokens. To mitigate this issue, this paper introduces patch-level training\nfor LLMs, which reduces the sequence length by compressing multiple tokens into\na single patch. During patch-level training, we feed the language model shorter\nsequences of patches and train it to predict the next patch, thereby processing\nthe majority of the training data at a significantly reduced computational\ncost. Following this, the model continues token-level training on the remaining\ntraining data to align with the inference mode. Experiments on a diverse range\nof models (370M-2.7B parameters) demonstrate that patch-level training can\nreduce overall computational costs to 0.5$\\times$, without compromising the\nmodel performance compared to token-level training. 
Source code:\n\\url{https://github.com/shaochenze/PatchTrain}.\n","authors":["Chenze Shao","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.12665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01854v2","updated":"2024-09-13T02:47:09Z","published":"2024-04-02T11:32:58Z","title":"IndoCulture: Exploring Geographically-Influenced Cultural Commonsense\n Reasoning Across Eleven Indonesian Provinces","summary":" Although commonsense reasoning is greatly shaped by cultural and geographical\nfactors, previous studies have predominantly centered on cultures grounded in\nthe English language, potentially resulting in an Anglocentric bias. In this\npaper, we introduce IndoCulture, aimed at understanding the influence of\ngeographical factors on language model reasoning ability, with a specific\nemphasis on the diverse cultures found within eleven Indonesian provinces. In\ncontrast to prior work that has relied on templates (Yin et al., 2022) and\nonline scrapping (Fung et al., 2024), we create IndoCulture by asking local\npeople to manually develop a cultural context and plausible options, across a\nset of predefined topics. Evaluation of 27 language models reveals several\ninsights: (1) the open-weight Llama-3 is competitive with GPT-4, while other\nopen-weight models struggle, with accuracies below 50%; (2) there is a general\npattern of models generally performing better for some provinces, such as Bali\nand West Java, and less well for others; and (3) the inclusion of location\ncontext enhances performance, especially for larger models like GPT-4,\nemphasizing the significance of geographical context in commonsense reasoning.\n","authors":["Fajri Koto","Rahmad Mahendra","Nurul Aisyah","Timothy Baldwin"],"pdf_url":"https://arxiv.org/pdf/2404.01854v2.pdf","comment":"Accepted at TACL"},{"id":"http://arxiv.org/abs/2409.06223v2","updated":"2024-09-13T02:17:58Z","published":"2024-09-10T05:26:53Z","title":"Enhancing Temporal Understanding in Audio Question Answering for Large\n Audio Language Models","summary":" The Audio Question Answering task includes audio event classification, audio\ncaptioning, and open ended reasoning. Recently, Audio Question Answering has\ngarnered attention due to the advent of Large Audio Language Models. Current\nliterature focuses on constructing LALMs by integrating audio encoders with\ntext only Large Language Models through a projection module. While Large Audio\nLanguage Models excel in general audio understanding, they are limited in\ntemporal reasoning which may hinder their commercial applications and on device\ndeployment. This paper addresses these challenges and limitations in audio\ntemporal reasoning. First, we introduce a data augmentation technique for\ngenerating reliable audio temporal questions and answers using an LLM. Second,\nwe propose a continued finetuning curriculum learning strategy to specialize in\ntemporal reasoning without compromising performance on finetuned tasks.\nFinally, we develop a reliable and transparent automated metric, assisted by an\nLLM, to measure the correlation between Large Audio Language Model responses\nand ground truth data intelligently. 
We demonstrate the effectiveness of our\nproposed techniques using SOTA LALMs on public audio benchmark datasets.\n","authors":["Arvind Krishna Sridhar","Yinyi Guo","Erik Visser"],"pdf_url":"https://arxiv.org/pdf/2409.06223v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.08483v1","updated":"2024-09-13T02:14:34Z","published":"2024-09-13T02:14:34Z","title":"A BERT-Based Summarization approach for depression detection","summary":" Depression is a globally prevalent mental disorder with potentially severe\nrepercussions if not addressed, especially in individuals with recurrent\nepisodes. Prior research has shown that early intervention has the potential to\nmitigate or alleviate symptoms of depression. However, implementing such\ninterventions in a real-world setting may pose considerable challenges. A\npromising strategy involves leveraging machine learning and artificial\nintelligence to autonomously detect depression indicators from diverse data\nsources. One of the most widely available and informative data sources is text,\nwhich can reveal a person's mood, thoughts, and feelings. In this context,\nvirtual agents programmed to conduct interviews using clinically validated\nquestionnaires, such as those found in the DAIC-WOZ dataset, offer a robust\nmeans for depression detection through linguistic analysis. Utilizing\nBERT-based models, which are powerful and versatile yet use fewer resources\nthan contemporary large language models, to convert text into numerical\nrepresentations significantly enhances the precision of depression diagnosis.\nThese models adeptly capture complex semantic and syntactic nuances, improving\nthe detection accuracy of depressive symptoms. Given the inherent limitations\nof these models concerning text length, our study proposes text summarization\nas a preprocessing technique to diminish the length and intricacies of input\ntexts. Implementing this method within our uniquely developed framework for\nfeature extraction and classification yielded an F1-score of 0.67 on the test\nset surpassing all prior benchmarks and 0.81 on the validation set exceeding\nmost previous results on the DAIC-WOZ dataset. Furthermore, we have devised a\ndepression lexicon to assess summary quality and relevance. This lexicon\nconstitutes a valuable asset for ongoing research in depression detection.\n","authors":["Hossein Salahshoor Gavalan","Mohmmad Naim Rastgoo","Bahareh Nakisa"],"pdf_url":"https://arxiv.org/pdf/2409.08483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16677v2","updated":"2024-09-13T01:56:05Z","published":"2024-05-26T19:58:38Z","title":"Crossmodal ASR Error Correction with Discrete Speech Units","summary":" ASR remains unsatisfactory in scenarios where the speaking style diverges\nfrom that used to train ASR systems, resulting in erroneous transcripts. To\naddress this, ASR Error Correction (AEC), a post-ASR processing approach, is\nrequired. In this work, we tackle an understudied issue: the Low-Resource\nOut-of-Domain (LROOD) problem, by investigating crossmodal AEC on very limited\ndownstream data with 1-best hypothesis transcription. We explore pre-training\nand fine-tuning strategies and uncover an ASR domain discrepancy phenomenon,\nshedding light on appropriate training schemes for LROOD data. Moreover, we\npropose the incorporation of discrete speech units to align with and enhance\nthe word embeddings for improving AEC quality. 
Results from multiple corpora\nand several evaluation metrics demonstrate the feasibility and efficacy of our\nproposed AEC approach on LROOD data as well as its generalizability and\nsuperiority on large-scale data. Finally, a study on speech emotion recognition\nconfirms that our model produces ASR error-robust transcripts suitable for\ndownstream applications.\n","authors":["Yuanchao Li","Pinzhen Chen","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2405.16677v2.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2406.08353v2","updated":"2024-09-13T01:48:15Z","published":"2024-06-12T15:59:25Z","title":"Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study\n on Word Error Rate and Fusion Techniques","summary":" Text data is commonly utilized as a primary input to enhance Speech Emotion\nRecognition (SER) performance and reliability. However, the reliance on\nhuman-transcribed text in most studies impedes the development of practical SER\nsystems, creating a gap between in-lab research and real-world scenarios where\nAutomatic Speech Recognition (ASR) serves as the text source. Hence, this study\nbenchmarks SER performance using ASR transcripts with varying Word Error Rates\n(WERs) from eleven models on three well-known corpora: IEMOCAP, CMU-MOSI, and\nMSP-Podcast. Our evaluation includes both text-only and bimodal SER with six\nfusion techniques, aiming for a comprehensive analysis that uncovers novel\nfindings and challenges faced by current SER research. Additionally, we propose\na unified ASR error-robust framework integrating ASR error correction and\nmodality-gated fusion, achieving lower WER and higher SER results compared to\nthe best-performing ASR transcript. These findings provide insights into SER\nwith ASR assistance, especially for real-world applications.\n","authors":["Yuanchao Li","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2406.08353v2.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.08466v1","updated":"2024-09-13T01:40:20Z","published":"2024-09-13T01:40:20Z","title":"Explaining Datasets in Words: Statistical Models with Natural Language\n Parameters","summary":" To make sense of massive data, we often fit simplified models and then\ninterpret the parameters; for example, we cluster the text embeddings and then\ninterpret the mean parameters of each cluster. However, these parameters are\noften high-dimensional and hard to interpret. To make model parameters directly\ninterpretable, we introduce a family of statistical models -- including\nclustering, time series, and classification models -- parameterized by natural\nlanguage predicates. For example, a cluster of text about COVID could be\nparameterized by the predicate \"discusses COVID\". To learn these statistical\nmodels effectively, we develop a model-agnostic algorithm that optimizes\ncontinuous relaxations of predicate parameters with gradient descent and\ndiscretizes them by prompting language models (LMs). Finally, we apply our\nframework to a wide range of problems: taxonomizing user chat dialogues,\ncharacterizing how they evolve across time, finding categories where one\nlanguage model is better than the other, clustering math problems based on\nsubareas, and explaining visual features in memorable images. Our framework is\nhighly versatile, applicable to both textual and visual domains, can be easily\nsteered to focus on specific properties (e.g. 
subareas), and explains\nsophisticated concepts that classical methods (e.g. n-gram analysis) struggle\nto produce.\n","authors":["Ruiqi Zhong","Heng Wang","Dan Klein","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2409.08466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01332v2","updated":"2024-09-13T00:56:01Z","published":"2024-03-29T22:49:43Z","title":"Explaining Large Language Models Decisions with Shapley Values","summary":" The emergence of large language models (LLMs) has opened up exciting\npossibilities for simulating human behavior and cognitive processes, with\npotential applications in various domains, including marketing research and\nconsumer behavior analysis. However, the validity of utilizing LLMs as\nstand-ins for human subjects remains uncertain due to glaring divergences that\nsuggest fundamentally different underlying processes at play and the\nsensitivity of LLM responses to prompt variations. This paper presents a novel\napproach based on Shapley values from cooperative game theory to interpret LLM\nbehavior and quantify the relative contribution of each prompt component to the\nmodel's output. Through two applications - a discrete choice experiment and an\ninvestigation of cognitive biases - we demonstrate how the Shapley value method\ncan uncover what we term \"token noise\" effects, a phenomenon where LLM\ndecisions are disproportionately influenced by tokens providing minimal\ninformative content. This phenomenon raises concerns about the robustness and\ngeneralizability of insights obtained from LLMs in the context of human\nbehavior simulation. Our model-agnostic approach extends its utility to\nproprietary LLMs, providing a valuable tool for practitioners and researchers\nto strategically optimize prompts and mitigate apparent cognitive biases. Our\nfindings underscore the need for a more nuanced understanding of the factors\ndriving LLM responses before relying on them as substitutes for human subjects\nin survey settings. We emphasize the importance of researchers reporting\nresults conditioned on specific prompt templates and exercising caution when\ndrawing parallels between human behavior and LLMs.\n","authors":["Behnam Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2404.01332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08435v1","updated":"2024-09-13T00:03:19Z","published":"2024-09-13T00:03:19Z","title":"When Context Leads but Parametric Memory Follows in Large Language\n Models","summary":" Large language models (LLMs) have demonstrated remarkable progress in\nleveraging diverse knowledge sources. This study investigates how nine widely\nused LLMs allocate knowledge between local context and global parameters when\nanswering open-ended questions in knowledge-consistent scenarios. We introduce\na novel dataset, WikiAtomic, and systematically vary context sizes to analyze\nhow LLMs prioritize and utilize the provided information and their parametric\nknowledge in knowledge-consistent scenarios. Additionally, we also study their\ntendency to hallucinate under varying context sizes. Our findings reveal\nconsistent patterns across models, including a consistent reliance on both\ncontextual (around 70%) and parametric (around 30%) knowledge, and a decrease\nin hallucinations with increasing context. These insights highlight the\nimportance of more effective context organization and developing models that\nuse input more deterministically for robust performance.\n","authors":["Yufei Tao","Adam Hiatt","Erik Haake","Antonie J. 
Jetter","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2409.08435v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.09018v1","updated":"2024-09-13T17:45:53Z","published":"2024-09-13T17:45:53Z","title":"An Efficient and Streaming Audio Visual Active Speaker Detection System","summary":" This paper delves into the challenging task of Active Speaker Detection\n(ASD), where the system needs to determine in real-time whether a person is\nspeaking or not in a series of video frames. While previous works have made\nsignificant strides in improving network architectures and learning effective\nrepresentations for ASD, a critical gap exists in the exploration of real-time\nsystem deployment. Existing models often suffer from high latency and memory\nusage, rendering them impractical for immediate applications. To bridge this\ngap, we present two scenarios that address the key challenges posed by\nreal-time constraints. First, we introduce a method to limit the number of\nfuture context frames utilized by the ASD model. By doing so, we alleviate the\nneed for processing the entire sequence of future frames before a decision is\nmade, significantly reducing latency. Second, we propose a more stringent\nconstraint that limits the total number of past frames the model can access\nduring inference. This tackles the persistent memory issues associated with\nrunning streaming ASD systems. Beyond these theoretical frameworks, we conduct\nextensive experiments to validate our approach. Our results demonstrate that\nconstrained transformer models can achieve performance comparable to or even\nbetter than state-of-the-art recurrent models, such as uni-directional GRUs,\nwith a significantly reduced number of context frames. Moreover, we shed light\non the temporal memory requirements of ASD systems, revealing that larger past\ncontext has a more profound impact on accuracy than future context. When\nprofiling on a CPU we find that our efficient architecture is memory bound by\nthe amount of past context it can use and that the compute cost is negligible\nas compared to the memory cost.\n","authors":["Arnav Kundu","Yanzi Jin","Mohammad Sekhavat","Max Horton","Danny Tormoen","Devang Naik"],"pdf_url":"https://arxiv.org/pdf/2409.09018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19331v2","updated":"2024-09-13T17:41:21Z","published":"2024-05-29T17:58:09Z","title":"NPGA: Neural Parametric Gaussian Avatars","summary":" The creation of high-fidelity, digital versions of human heads is an\nimportant stepping stone in the process of further integrating virtual\ncomponents into our everyday lives. Constructing such avatars is a challenging\nresearch problem, due to a high demand for photo-realism and real-time\nrendering performance. In this work, we propose Neural Parametric Gaussian\nAvatars (NPGA), a data-driven approach to create high-fidelity, controllable\navatars from multi-view video recordings. We build our method around 3D\nGaussian splatting for its highly efficient rendering and to inherit the\ntopological flexibility of point clouds. In contrast to previous work, we\ncondition our avatars' dynamics on the rich expression space of neural\nparametric head models (NPHM), instead of mesh-based 3DMMs. To this end, we\ndistill the backward deformation field of our underlying NPHM into forward\ndeformations which are compatible with rasterization-based rendering. 
All\nremaining fine-scale, expression-dependent details are learned from the\nmulti-view videos. For increased representational capacity of our avatars, we\npropose per-Gaussian latent features that condition each primitive's dynamic\nbehavior. To regularize this increased dynamic expressivity, we propose\nLaplacian terms on the latent features and predicted dynamics. We evaluate our\nmethod on the public NeRSemble dataset, demonstrating that NPGA significantly\noutperforms the previous state-of-the-art avatars on the self-reenactment task\nby 2.6 PSNR. Furthermore, we demonstrate accurate animation capabilities from\nreal-world monocular videos.\n","authors":["Simon Giebenhain","Tobias Kirschstein","Martin Rünz","Lourdes Agapito","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2405.19331v2.pdf","comment":"Project Page: see https://simongiebenhain.github.io/NPGA/ ; Youtube\n Video: see https://youtu.be/t0S0OK7WnA4"},{"id":"http://arxiv.org/abs/2405.06468v3","updated":"2024-09-13T16:44:17Z","published":"2024-05-10T13:27:32Z","title":"Pseudo-Prompt Generating in Pre-trained Vision-Language Models for\n Multi-Label Medical Image Classification","summary":" The task of medical image recognition is notably complicated by the presence\nof varied and multiple pathological indications, presenting a unique challenge\nin multi-label classification with unseen labels. This complexity underlines\nthe need for computer-aided diagnosis methods employing multi-label zero-shot\nlearning. Recent advancements in pre-trained vision-language models (VLMs) have\nshowcased notable zero-shot classification abilities on medical images.\nHowever, these methods have limitations on leveraging extensive pre-trained\nknowledge from broader image datasets, and often depend on manual prompt\nconstruction by expert radiologists. By automating the process of prompt\ntuning, prompt learning techniques have emerged as an efficient way to adapt\nVLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in\nperforming class-specific prompts on unseen categories, limiting\ngeneralizability in fine-grained scenarios. To overcome these constraints, we\nintroduce a novel prompt generation approach inspired by text generation in\nnatural language processing (NLP). Our method, named Pseudo-Prompt Generating\n(PsPG), capitalizes on the prior knowledge of multi-modal features. Featuring\nan RNN-based decoder, PsPG autoregressively generates class-tailored embedding\nvectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label\nchest radiograph datasets affirm the superiority of our approach against\nleading medical vision-language and multi-label prompt learning methods. The\nsource code is available at https://github.com/fallingnight/PsPG\n","authors":["Yaoqin Ye","Junjie Zhang","Hongwei Shi"],"pdf_url":"https://arxiv.org/pdf/2405.06468v3.pdf","comment":"Accepted by PRCV 2024"},{"id":"http://arxiv.org/abs/2409.08953v1","updated":"2024-09-13T16:14:45Z","published":"2024-09-13T16:14:45Z","title":"Pushing the boundaries of event subsampling in event-based video\n classification using CNNs","summary":" Event cameras offer low-power visual sensing capabilities ideal for\nedge-device applications. However, their high event rate, driven by high\ntemporal details, can be restrictive in terms of bandwidth and computational\nresources. 
In edge AI applications, determining the minimum amount of events\nfor specific tasks can allow reducing the event rate to improve bandwidth,\nmemory, and processing efficiency. In this paper, we study the effect of event\nsubsampling on the accuracy of event data classification using convolutional\nneural network (CNN) models. Surprisingly, across various datasets, the number\nof events per video can be reduced by an order of magnitude with little drop in\naccuracy, revealing the extent to which we can push the boundaries in accuracy\nvs. event rate trade-off. Additionally, we also find that lower classification\naccuracy in high subsampling rates is not solely attributable to information\nloss due to the subsampling of the events, but that the training of CNNs can be\nchallenging in highly subsampled scenarios, where the sensitivity to\nhyperparameters increases. We quantify training instability across multiple\nevent-based classification datasets using a novel metric for evaluating the\nhyperparameter sensitivity of CNNs in different subsampling settings. Finally,\nwe analyze the weight gradients of the network to gain insight into this\ninstability.\n","authors":["Hesam Araghi","Jan van Gemert","Nergis Tomen"],"pdf_url":"https://arxiv.org/pdf/2409.08953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08947v1","updated":"2024-09-13T16:07:25Z","published":"2024-09-13T16:07:25Z","title":"A Diffusion Approach to Radiance Field Relighting using\n Multi-Illumination Synthesis","summary":" Relighting radiance fields is severely underconstrained for multi-view data,\nwhich is most often captured under a single illumination condition; It is\nespecially hard for full scenes containing multiple objects. We introduce a\nmethod to create relightable radiance fields using such single-illumination\ndata by exploiting priors extracted from 2D image diffusion models. We first\nfine-tune a 2D diffusion model on a multi-illumination dataset conditioned by\nlight direction, allowing us to augment a single-illumination capture into a\nrealistic -- but possibly inconsistent -- multi-illumination dataset from\ndirectly defined light directions. We use this augmented data to create a\nrelightable radiance field represented by 3D Gaussian splats. To allow direct\ncontrol of light direction for low-frequency lighting, we represent appearance\nwith a multi-layer perceptron parameterized on light direction. To enforce\nmulti-view consistency and overcome inaccuracies we optimize a per-image\nauxiliary feature vector. We show results on synthetic and real multi-view data\nunder single illumination, demonstrating that our method successfully exploits\n2D diffusion model priors to allow realistic 3D relighting for complete scenes.\nProject site\nhttps://repo-sam.inria.fr/fungraph/generative-radiance-field-relighting/\n","authors":["Yohan Poirier-Ginter","Alban Gauthier","Julien Phillip","Jean-Francois Lalonde","George Drettakis"],"pdf_url":"https://arxiv.org/pdf/2409.08947v1.pdf","comment":"Project site\n https://repo-sam.inria.fr/fungraph/generative-radiance-field-relighting/"},{"id":"http://arxiv.org/abs/2409.08943v1","updated":"2024-09-13T16:01:27Z","published":"2024-09-13T16:01:27Z","title":"Pushing Joint Image Denoising and Classification to the Edge","summary":" In this paper, we jointly combine image classification and image denoising,\naiming to enhance human perception of noisy images captured by edge devices,\nlike low-light security cameras. 
In such settings, it is important to retain\nthe ability of humans to verify the automatic classification decision and thus\njointly denoise the image to enhance human perception. Since edge devices have\nlittle computational power, we explicitly optimize for efficiency by proposing\na novel architecture that integrates the two tasks. Additionally, we alter a\nNeural Architecture Search (NAS) method, which searches for classifiers to\nsearch for the integrated model while optimizing for a target latency,\nclassification accuracy, and denoising performance. The NAS architectures\noutperform our manually designed alternatives in both denoising and\nclassification, offering a significant improvement to human perception. Our\napproach empowers users to construct architectures tailored to domains like\nmedical imaging, surveillance systems, and industrial inspections.\n","authors":["Thomas C Markhorst","Jan C van Gemert","Osman S Kayhan"],"pdf_url":"https://arxiv.org/pdf/2409.08943v1.pdf","comment":"Accepted paper at the ECCV 2024 workshop on Advances in Image\n Manipulation (AIM)"},{"id":"http://arxiv.org/abs/2409.08926v1","updated":"2024-09-13T15:44:38Z","published":"2024-09-13T15:44:38Z","title":"ClearDepth: Enhanced Stereo Perception of Transparent Objects for\n Robotic Manipulation","summary":" Transparent object depth perception poses a challenge in everyday life and\nlogistics, primarily due to the inability of standard 3D sensors to accurately\ncapture depth on transparent or reflective surfaces. This limitation\nsignificantly affects depth map and point cloud-reliant applications,\nespecially in robotic manipulation. We developed a vision transformer-based\nalgorithm for stereo depth recovery of transparent objects. This approach is\ncomplemented by an innovative feature post-fusion module, which enhances the\naccuracy of depth recovery by structural features in images. To address the\nhigh costs associated with dataset collection for stereo camera-based\nperception of transparent objects, our method incorporates a parameter-aligned,\ndomain-adaptive, and physically realistic Sim2Real simulation for efficient\ndata generation, accelerated by AI algorithm. Our experimental results\ndemonstrate the model's exceptional Sim2Real generalizability in real-world\nscenarios, enabling precise depth mapping of transparent objects to assist in\nrobotic manipulation. Project details are available at\nhttps://sites.google.com/view/cleardepth/ .\n","authors":["Kaixin Bai","Huajian Zeng","Lei Zhang","Yiwen Liu","Hongli Xu","Zhaopeng Chen","Jianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08926v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2302.13080v3","updated":"2024-09-13T15:38:23Z","published":"2023-02-25T13:58:37Z","title":"Does a Neural Network Really Encode Symbolic Concepts?","summary":" Recently, a series of studies have tried to extract interactions between\ninput variables modeled by a DNN and define such interactions as concepts\nencoded by the DNN. However, strictly speaking, there still lacks a solid\nguarantee whether such interactions indeed represent meaningful concepts.\nTherefore, in this paper, we examine the trustworthiness of interaction\nconcepts from four perspectives. 
Extensive empirical studies have verified that\na well-trained DNN usually encodes sparse, transferable, and discriminative\nconcepts, which is partially aligned with human intuition.\n","authors":["Mingjie Li","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13080v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08906v1","updated":"2024-09-13T15:20:03Z","published":"2024-09-13T15:20:03Z","title":"Gaussian is All You Need: A Unified Framework for Solving Inverse\n Problems via Diffusion Posterior Sampling","summary":" Diffusion models can generate a variety of high-quality images by modeling\ncomplex data distributions. Trained diffusion models can also be very effective\nimage priors for solving inverse problems. Most of the existing diffusion-based\nmethods integrate data consistency steps within the diffusion reverse sampling\nprocess. The data consistency steps rely on an approximate likelihood function.\nIn this paper, we show that the existing approximations are either insufficient\nor computationally inefficient. To address these issues, we propose a unified\nlikelihood approximation method that incorporates a covariance correction term\nto enhance the performance and avoids propagating gradients through the\ndiffusion model. The correction term, when integrated into the reverse\ndiffusion sampling process, achieves better convergence towards the true data\nposterior for selected distributions and improves performance on real-world\nnatural image datasets. Furthermore, we present an efficient way to factorize\nand invert the covariance matrix of the likelihood function for several inverse\nproblems. We present comprehensive experiments to demonstrate the effectiveness\nof our method over several existing approaches.\n","authors":["Nebiyou Yismaw","Ulugbek S. Kamilov","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2409.08906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08905v1","updated":"2024-09-13T15:16:28Z","published":"2024-09-13T15:16:28Z","title":"D2-MLP: Dynamic Decomposed MLP Mixer for Medical Image Segmentation","summary":" Convolutional neural networks are widely used in various segmentation tasks\nin medical images. However, they are challenged to learn global features\nadaptively due to the inherent locality of convolutional operations. In\ncontrast, MLP Mixers are proposed as a backbone to learn global information\nacross channels with low complexity. However, they cannot capture spatial\nfeatures efficiently. Additionally, they lack effective mechanisms to fuse and\nmix features adaptively. To tackle these limitations, we propose a novel\nDynamic Decomposed Mixer module. It is designed to employ novel Mixers to\nextract features and aggregate information across different spatial locations\nand channels. Additionally, it employs novel dynamic mixing mechanisms to model\ninter-dependencies between channel and spatial feature representations and to\nfuse them adaptively. Subsequently, we incorporate it into a U-shaped\nTransformer-based architecture to generate a novel network, termed the Dynamic\nDecomposed MLP Mixer. 
We evaluated it for medical image segmentation on two\ndatasets, and it achieved superior segmentation performance than other\nstate-of-the-art methods.\n","authors":["Jin Yang","Xiaobing Yu","Peijie Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.08905v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.06110v3","updated":"2024-09-13T15:09:39Z","published":"2023-05-10T12:54:02Z","title":"Pavlok-Nudge: A Feedback Mechanism for Atomic Behaviour Modification\n with Snoring Usecase","summary":" This paper proposes a feedback mechanism to change behavioural patterns using\nthe Pavlok device. Pavlok utilises beeps, vibration and shocks as a mode of\naversion technique to help individuals with behaviour modification. While the\ndevice can be useful in certain periodic daily life situations, like alarms and\nexercise notifications, the device relies on manual operations that limit its\nusage. To automate behaviour modification, we propose a framework that first\ndetects targeted behaviours through a lightweight deep learning model and\nsubsequently nudges the user through Pavlok. Our proposed solution is\nimplemented and verified in the context of snoring, which captures audio from\nthe environment following a prediction of whether the audio content is a snore\nor not using a 1D convolutional neural network. Based on the prediction, we use\nPavlok to nudge users for preventive measures, such as a change in sleeping\nposture. We believe that this simple solution can help people to change their\natomic habits, which may lead to long-term health benefits. Our proposed\nreal-time, lightweight model (99.8% less parameters over SOTA; 1,278,049 -->\n1337) achieves SOTA performance (test accuracy of 0.99) on a public domain\nbenchmark. The code and model are publicly available at\nhttps://github.com/hasan-rakibul/pavlok-nudge-snore.\n","authors":["Md Rakibul Hasan","Shreya Ghosh","Pradyumna Agrawal","Zhixi Cai","Abhinav Dhall","Tom Gedeon"],"pdf_url":"https://arxiv.org/pdf/2305.06110v3.pdf","comment":"Md Rakibul Hasan and Shreya Ghosh are co-first authors"},{"id":"http://arxiv.org/abs/2310.07248v4","updated":"2024-09-13T15:06:26Z","published":"2023-10-11T07:25:50Z","title":"IBoxCLA: Towards Robust Box-supervised Segmentation of Polyp via\n Improved Box-dice and Contrastive Latent-anchors","summary":" Box-supervised polyp segmentation attracts increasing attention for its\ncost-effective potential. Existing solutions often rely on learning-free\nmethods or pretrained models to laboriously generate pseudo masks, triggering\nDice constraint subsequently. In this paper, we found that a model guided by\nthe simplest box-filled masks can accurately predict polyp locations/sizes, but\nsuffers from shape collapsing. In response, we propose two innovative learning\nfashions, Improved Box-dice (IBox) and Contrastive Latent-Anchors (CLA), and\ncombine them to train a robust box-supervised model IBoxCLA. The core idea\nbehind IBoxCLA is to decouple the learning of location/size and shape, allowing\nfor focused constraints on each of them. Specifically, IBox transforms the\nsegmentation map into a proxy map using shape decoupling and confusion-region\nswapping sequentially. Within the proxy map, shapes are disentangled, while\nlocations/sizes are encoded as box-like responses. By constraining the proxy\nmap instead of the raw prediction, the box-filled mask can well supervise\nIBoxCLA without misleading its shape learning. 
Furthermore, CLA contributes to\nshape learning by generating two types of latent anchors, which are learned and\nupdated using momentum and segmented polyps to steadily represent polyp and\nbackground features. The latent anchors facilitate IBoxCLA to capture\ndiscriminative features within and outside boxes in a contrastive manner,\nyielding clearer boundaries. We benchmark IBoxCLA on five public polyp\ndatasets. The experimental results demonstrate the competitive performance of\nIBoxCLA compared to recent fully-supervised polyp segmentation methods, and its\nsuperiority over other box-supervised state-of-the-arts with a relative\nincrease of overall mDice and mIoU by at least 6.5% and 7.5%, respectively.\n","authors":["Zhiwei Wang","Qiang Hu","Hongkuan Shi","Li He","Man He","Wenxuan Dai","Yinjiao Tian","Xin Yang","Mei Liu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2310.07248v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08887v1","updated":"2024-09-13T14:54:37Z","published":"2024-09-13T14:54:37Z","title":"Visual Language Tracking with Multi-modal Interaction: A Robust\n Benchmark","summary":" Visual Language Tracking (VLT) enhances tracking by mitigating the\nlimitations of relying solely on the visual modality, utilizing high-level\nsemantic information through language. This integration of the language enables\nmore advanced human-machine interaction. The essence of interaction is\ncognitive alignment, which typically requires multiple information exchanges,\nespecially in the sequential decision-making process of VLT. However, current\nVLT benchmarks do not account for multi-round interactions during tracking.\nThey provide only an initial text and bounding box (bbox) in the first frame,\nwith no further interaction as tracking progresses, deviating from the original\nmotivation of the VLT task. To address these limitations, we propose a novel\nand robust benchmark, VLT-MI (Visual Language Tracking with Multi-modal\nInteraction), which introduces multi-round interaction into the VLT task for\nthe first time. (1) We generate diverse, multi-granularity texts for\nmulti-round, multi-modal interaction based on existing mainstream VLT\nbenchmarks using DTLLM-VLT, leveraging the world knowledge of LLMs. (2) We\npropose a new VLT interaction paradigm that achieves multi-round interaction\nthrough text updates and object recovery. When multiple tracking failures\noccur, we provide the tracker with more aligned texts and corrected bboxes\nthrough interaction, thereby expanding the scope of VLT downstream tasks. (3)\nWe conduct comparative experiments on both traditional VLT benchmarks and\nVLT-MI, evaluating and analyzing the accuracy and robustness of trackers under\nthe interactive paradigm. This work offers new insights and paradigms for the\nVLT task, enabling a fine-grained evaluation of multi-modal trackers. We\nbelieve this approach can be extended to additional datasets in the future,\nsupporting broader evaluations and comparisons of video-language model\ncapabilities.\n","authors":["Xuchen Li","Shiyu Hu","Xiaokun Feng","Dailing Zhang","Meiqi Wu","Jing Zhang","Kaiqi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.08887v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.08885v1","updated":"2024-09-13T14:50:50Z","published":"2024-09-13T14:50:50Z","title":"Interactive Masked Image Modeling for Multimodal Object Detection in\n Remote Sensing","summary":" Object detection in remote sensing imagery plays a vital role in various\nEarth observation applications. 
However, unlike object detection in natural\nscene images, this task is particularly challenging due to the abundance of\nsmall, often barely visible objects across diverse terrains. To address these\nchallenges, multimodal learning can be used to integrate features from\ndifferent data modalities, thereby improving detection accuracy. Nonetheless,\nthe performance of multimodal learning is often constrained by the limited size\nof labeled datasets. In this paper, we propose to use Masked Image Modeling\n(MIM) as a pre-training technique, leveraging self-supervised learning on\nunlabeled data to enhance detection performance. However, conventional MIM methods\nsuch as MAE, which use masked tokens without any contextual information, struggle\nto capture fine-grained details due to a lack of interactions with other\nparts of the image. To address this, we propose a new interactive MIM method that\ncan establish interactions between different tokens, which is particularly\nbeneficial for object detection in remote sensing. Extensive ablation\nstudies and evaluations demonstrate the effectiveness of our approach.\n","authors":["Minh-Duc Vu","Zuheng Ming","Fangchen Feng","Bissmella Bahaduri","Anissa Mokraoui"],"pdf_url":"https://arxiv.org/pdf/2409.08885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08884v1","updated":"2024-09-13T14:50:14Z","published":"2024-09-13T14:50:14Z","title":"Detect Fake with Fake: Leveraging Synthetic Data-driven Representation\n for Synthetic Image Detection","summary":" Are general-purpose visual representations acquired solely from synthetic\ndata useful for detecting fake images? In this work, we show the effectiveness\nof synthetic data-driven representations for synthetic image detection. Upon\nanalysis, we find that vision transformers trained by the latest visual\nrepresentation learners with synthetic data can effectively distinguish fake\nfrom real images without seeing any real images during pre-training. Notably,\nusing SynCLR as the backbone in a state-of-the-art detection method\ndemonstrates a performance improvement of +10.32 mAP and +4.73% accuracy over\nthe widely used CLIP, when tested on previously unseen GAN models. Code is\navailable at https://github.com/cvpaperchallenge/detect-fake-with-fake.\n","authors":["Hina Otake","Yoshihiro Fukuhara","Yoshiki Kubotani","Shigeo Morishima"],"pdf_url":"https://arxiv.org/pdf/2409.08884v1.pdf","comment":"Accepted to TWYN workshop at ECCV 2024"},{"id":"http://arxiv.org/abs/2409.08857v1","updated":"2024-09-13T14:19:27Z","published":"2024-09-13T14:19:27Z","title":"InstantDrag: Improving Interactivity in Drag-based Image Editing","summary":" Drag-based image editing has recently gained popularity for its interactivity\nand precision. However, despite the ability of text-to-image models to generate\nsamples within a second, drag editing still lags behind due to the challenge of\naccurately reflecting user interaction while maintaining image content. Some\nexisting approaches rely on computationally intensive per-image optimization or\nintricate guidance-based methods, requiring additional inputs such as masks for\nmovable regions and text prompts, thereby compromising the interactivity of the\nediting process. We introduce InstantDrag, an optimization-free pipeline that\nenhances interactivity and speed, requiring only an image and a drag\ninstruction as input. 
InstantDrag consists of two carefully designed networks:\na drag-conditioned optical flow generator (FlowGen) and an optical\nflow-conditioned diffusion model (FlowDiffusion). InstantDrag learns motion\ndynamics for drag-based image editing in real-world video datasets by\ndecomposing the task into motion generation and motion-conditioned image\ngeneration. We demonstrate InstantDrag's capability to perform fast,\nphoto-realistic edits without masks or text prompts through experiments on\nfacial video datasets and general scenes. These results highlight the\nefficiency of our approach in handling drag-based image editing, making it a\npromising solution for interactive, real-time applications.\n","authors":["Joonghyuk Shin","Daehyeon Choi","Jaesik Park"],"pdf_url":"https://arxiv.org/pdf/2409.08857v1.pdf","comment":"SIGGRAPH Asia 2024. Project webpage at\n https://joonghyuk.com/instantdrag-web/"},{"id":"http://arxiv.org/abs/2409.07003v2","updated":"2024-09-13T14:17:17Z","published":"2024-09-11T04:31:09Z","title":"ODYSSEE: Oyster Detection Yielded by Sensor Systems on Edge Electronics","summary":" Oysters are a vital keystone species in coastal ecosystems, providing\nsignificant economic, environmental, and cultural benefits. As the importance\nof oysters grows, so does the relevance of autonomous systems for their\ndetection and monitoring. However, current monitoring strategies often rely on\ndestructive methods. While manual identification of oysters from video footage\nis non-destructive, it is time-consuming, requires expert input, and is further\ncomplicated by the challenges of the underwater environment.\n To address these challenges, we propose a novel pipeline using stable\ndiffusion to augment a collected real dataset with realistic synthetic data.\nThis method enhances the dataset used to train a YOLOv10-based vision model.\nThe model is then deployed and tested on an edge platform in underwater\nrobotics, achieving a state-of-the-art 0.657 mAP@50 for oyster detection on the\nAqua2 platform.\n","authors":["Xiaomin Lin","Vivek Mange","Arjun Suresh","Bernhard Neuberger","Aadi Palnitkar","Brendan Campbell","Alan Williams","Kleio Baxevani","Jeremy Mallette","Alhim Vera","Markus Vincze","Ioannis Rekleitis","Herbert G. Tanner","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2409.07003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08850v1","updated":"2024-09-13T14:06:12Z","published":"2024-09-13T14:06:12Z","title":"DX2CT: Diffusion Model for 3D CT Reconstruction from Bi or Mono-planar\n 2D X-ray(s)","summary":" Computational tomography (CT) provides high-resolution medical imaging, but\nit can expose patients to high radiation. X-ray scanners have low radiation\nexposure, but their resolutions are low. This paper proposes a new conditional\ndiffusion model, DX2CT, that reconstructs three-dimensional (3D) CT volumes\nfrom bi or mono-planar X-ray image(s). Proposed DX2CT consists of two key\ncomponents: 1) modulating feature maps extracted from two-dimensional (2D)\nX-ray(s) with 3D positions of CT volume using a new transformer and 2)\neffectively using the modulated 3D position-aware feature maps as conditions of\nDX2CT. In particular, the proposed transformer can provide conditions with rich\ninformation of a target CT slice to the conditional diffusion model, enabling\nhigh-quality CT reconstruction. Our experiments with the bi or mono-planar\nX-ray(s) benchmark datasets show that proposed DX2CT outperforms several\nstate-of-the-art methods. 
Our codes and model will be available at:\nhttps://www.github.com/intyeger/DX2CT.\n","authors":["Yun Su Jeong","Hye Bin Yoo","Il Yong Chun"],"pdf_url":"https://arxiv.org/pdf/2409.08850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08847v1","updated":"2024-09-13T14:05:26Z","published":"2024-09-13T14:05:26Z","title":"Kinect Calibration and Data Optimization For Anthropometric Parameters","summary":" Recently, through development of several 3d vision systems, widely used in\nvarious applications, medical and biometric fields. Microsoft kinect sensor\nhave been most of used camera among 3d vision systems. Microsoft kinect sensor\ncan obtain depth images of a scene and 3d coordinates of human joints. Thus,\nanthropometric features can extractable easily. Anthropometric feature and 3d\njoint coordinate raw datas which captured from kinect sensor is unstable. The\nstrongest reason for this, datas vary by distance between joints of individual\nand location of kinect sensor. Consequently, usage of this datas without kinect\ncalibration and data optimization does not result in sufficient and healthy. In\nthis study, proposed a novel method to calibrating kinect sensor and optimizing\nskeleton features. Results indicate that the proposed method is quite effective\nand worthy of further study in more general scenarios.\n","authors":["M. S. Gokmen","M. Akbaba","O. Findik"],"pdf_url":"https://arxiv.org/pdf/2409.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08840v1","updated":"2024-09-13T13:53:52Z","published":"2024-09-13T13:53:52Z","title":"Direct-CP: Directed Collaborative Perception for Connected and\n Autonomous Vehicles via Proactive Attention","summary":" Collaborative perception (CP) leverages visual data from connected and\nautonomous vehicles (CAV) to enhance an ego vehicle's field of view (FoV).\nDespite recent progress, current CP methods expand the ego vehicle's 360-degree\nperceptual range almost equally, which faces two key challenges. Firstly, in\nareas with uneven traffic distribution, focusing on directions with little\ntraffic offers limited benefits. Secondly, under limited communication budgets,\nallocating excessive bandwidth to less critical directions lowers the\nperception accuracy in more vital areas. To address these issues, we propose\nDirect-CP, a proactive and direction-aware CP system aiming at improving CP in\nspecific directions. Our key idea is to enable an ego vehicle to proactively\nsignal its interested directions and readjust its attention to enhance local\ndirectional CP performance. To achieve this, we first propose an RSU-aided\ndirection masking mechanism that assists an ego vehicle in identifying vital\ndirections. Additionally, we design a direction-aware selective attention\nmodule to wisely aggregate pertinent features based on ego vehicle's\ndirectional priorities, communication budget, and the positional data of CAVs.\nMoreover, we introduce a direction-weighted detection loss (DWLoss) to capture\nthe divergence between directional CP outcomes and the ground truth,\nfacilitating effective model training. 
Extensive experiments on the V2X-Sim 2.0\ndataset demonstrate that our approach achieves 19.8\\% higher local perception\naccuracy in interested directions and 2.5\\% higher overall perception accuracy\nthan the state-of-the-art methods in collaborative 3D object detection tasks.\n","authors":["Yihang Tao","Senkang Hu","Zhengru Fang","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2409.08840v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.08831v1","updated":"2024-09-13T13:47:12Z","published":"2024-09-13T13:47:12Z","title":"Breaking reCAPTCHAv2","summary":" Our work examines the efficacy of employing advanced machine learning methods\nto solve captchas from Google's reCAPTCHAv2 system. We evaluate the\neffectiveness of automated systems in solving captchas by utilizing advanced\nYOLO models for image segmentation and classification. Our main result is that\nwe can solve 100% of the captchas, while previous work only solved 68-71%.\nFurthermore, our findings suggest that there is no significant difference in\nthe number of challenges humans and bots must solve to pass the captchas in\nreCAPTCHAv2. This implies that current AI technologies can exploit advanced\nimage-based captchas. We also look under the hood of reCAPTCHAv2, and find\nevidence that reCAPTCHAv2 is heavily based on cookie and browser history data\nwhen evaluating whether a user is human or not. The code is provided alongside\nthis paper.\n","authors":["Andreas Plesner","Tobias Vontobel","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2409.08831v1.pdf","comment":"10 pages. Accepted at COMPSAC 2024"},{"id":"http://arxiv.org/abs/2404.16571v3","updated":"2024-09-13T13:40:41Z","published":"2024-04-25T12:34:23Z","title":"MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth\n Estimation of Endoscopic Images","summary":" Photometric constraint is indispensable for self-supervised monocular depth\nestimation. It involves warping a source image onto a target view using\nestimated depth&pose, and then minimizing the difference between the warped and\ntarget images. However, the endoscopic built-in light causes significant\nbrightness fluctuations, and thus makes the photometric constraint unreliable.\nPrevious efforts only mitigate this relying on extra models to calibrate image\nbrightness. In this paper, we propose MonoPCC to address the brightness\ninconsistency radically by reshaping the photometric constraint into a cycle\nform. Instead of only warping the source image, MonoPCC constructs a closed\nloop consisting of two opposite forward-backward warping paths: from target to\nsource and then back to target. Thus, the target image finally receives an\nimage cycle-warped from itself, which naturally makes the constraint invariant\nto brightness changes. Moreover, MonoPCC transplants the source image's\nphase-frequency into the intermediate warped image to avoid structure lost, and\nalso stabilizes the training via an exponential moving average (EMA) strategy\nto avoid frequent changes in the forward warping. 
The comprehensive and\nextensive experimental results on four endoscopic datasets demonstrate that our\nproposed MonoPCC shows great robustness to brightness inconsistency, and\nexceeds other state-of-the-art methods by reducing the absolute relative error by at\nleast 7.27%, 9.38%, 9.90% and 3.17%, respectively.\n","authors":["Zhiwei Wang","Ying Zhou","Shiquan He","Ting Li","Fan Huang","Qiang Ding","Xinxia Feng","Mei Liu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.16571v3.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.08824v1","updated":"2024-09-13T13:37:33Z","published":"2024-09-13T13:37:33Z","title":"Pathfinder for Low-altitude Aircraft with Binary Neural Network","summary":" A prior global topological map (e.g., the OpenStreetMap, OSM) can boost the\nperformance of autonomous mapping by a ground mobile robot. However, the prior\nmap is usually incomplete due to missing labels on some paths. To solve\nthis problem, this paper proposes an OSM maker using airborne sensors carried\nby low-altitude aircraft, where the core of the OSM maker is a novel efficient\npathfinder approach based on LiDAR and camera data, i.e., a binary dual-stream\nroad segmentation model. Specifically, multi-scale feature extraction based\non the UNet architecture is implemented for images and point clouds. To reduce\nthe effect caused by the sparsity of the point cloud, an attention-guided gated\nblock is designed to integrate image and point-cloud features. To enhance\nthe efficiency of the model, we propose a binarization streamline for each model\ncomponent, including a variant of the vision transformer (ViT) architecture as the\nencoder of the image branch, and new focal and perception losses to optimize\nthe model training. The experimental results on two datasets demonstrate that\nour pathfinder method achieves SOTA accuracy with high efficiency in finding\npaths from the low-level airborne sensors, and we can create complete OSM prior\nmaps based on the segmented road skeletons. Code and data are available at:\nhttps://github.com/IMRL/Pathfinder.\n","authors":["Kaijie Yin","Tian Gao","Hui Kong"],"pdf_url":"https://arxiv.org/pdf/2409.08824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08800v1","updated":"2024-09-13T13:08:24Z","published":"2024-09-13T13:08:24Z","title":"Task-Specific Data Preparation for Deep Learning to Reconstruct\n Structures of Interest from Severely Truncated CBCT Data","summary":" Cone-beam computed tomography (CBCT) is widely used in interventional\nsurgeries and radiation oncology. Due to the limited size of flat-panel\ndetectors, anatomical structures might be missing outside the limited\nfield-of-view (FOV), which restricts the clinical applications of CBCT systems.\nRecently, deep learning methods have been proposed to extend the FOV for\nmulti-slice CT systems. However, in mobile CBCT systems with a smaller FOV size,\nprojection data is severely truncated and it is challenging for a network to\nrestore all missing structures outside the FOV. In some applications, only\ncertain structures outside the FOV are of interest, e.g., ribs in needle path\nplanning for liver/lung cancer diagnosis. Therefore, a task-specific data\npreparation method is proposed in this work, which automatically lets the\nnetwork focus on structures of interest instead of all the structures. 
Our\npreliminary experiment shows that Pix2pixGAN with a conventional training has\nthe risk to reconstruct false positive and false negative rib structures from\nseverely truncated CBCT data, whereas Pix2pixGAN with the proposed\ntask-specific training can reconstruct all the ribs reliably. The proposed\nmethod is promising to empower CBCT with more clinical applications.\n","authors":["Yixing Huang","Fuxin Fan","Ahmed Gomaa","Andreas Maier","Rainer Fietkau","Christoph Bert","Florian Putz"],"pdf_url":"https://arxiv.org/pdf/2409.08800v1.pdf","comment":"Published in the CT-Meeting 2024 proceeding. arXiv admin note: text\n overlap with arXiv:2108.13844"},{"id":"http://arxiv.org/abs/2409.08782v1","updated":"2024-09-13T12:39:57Z","published":"2024-09-13T12:39:57Z","title":"Contactless Fingerprint Recognition Using 3D Graph Matching","summary":" Contactless fingerprint is a newly developed type of fingerprint, and has\ngained lots of attention in recent fingerprint studies. However, most existing\ncontactless fingerprint algorithms treat contactless fingerprints as 2D plain\nfingerprints, and utilize similar recognition methods as traditional\ncontact-based 2D fingerprints. This recognition approach does not consider the\nmodality difference between contactless and contact fingerprints, especially\nthe intrinsic 3D characteristic of contactless fingerprints. This paper\nproposes a novel contactless fingerprint recognition algorithm that captures\nthe revealed 3D feature of contactless fingerprints rather than the plain 2D\nfeature. The proposed method first recovers 3D features from the input\ncontactless fingerprint, including the 3D shape model and 3D fingerprint\nfeature (minutiae, orientation, etc.). Then, a novel 3D graph matching is\nconducted in 3D space according to the extracted 3D feature. Our method\ncaptures the real 3D nature of contactless fingerprints as the whole feature\nextraction and matching algorithms are completed in real 3D space. Experiments\nresults on contactless fingerprint databases show that the proposed method\nsuccessfully improves the matching accuracy of contactless fingerprints.\nExceptionally, our method performs stably across multiple poses of contactless\nfingerprints due to 3D graph matching, which is a great advantage compared to\nprevious contactless fingerprint recognition algorithms.\n","authors":["Zhe Cui","Yuwei Jia","Siyang Zheng","Fei Su"],"pdf_url":"https://arxiv.org/pdf/2409.08782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05752v2","updated":"2024-09-13T12:37:34Z","published":"2024-08-11T11:53:29Z","title":"RTF-Q: Efficient Unsupervised Domain Adaptation with Retraining-free\n Quantization","summary":" Performing unsupervised domain adaptation on resource-constrained edge\ndevices is challenging. Existing research typically adopts architecture\noptimization (e.g., designing slimmable networks) but requires expensive\ntraining costs. Moreover, it does not consider the considerable precision\nredundancy of parameters and activations. To address these limitations, we\npropose efficient unsupervised domain adaptation with ReTraining-Free\nQuantization (RTF-Q). Our approach uses low-precision quantization\narchitectures with varying computational costs, adapting to devices with\ndynamic computation budgets. 
We subtly configure subnet dimensions and leverage\nweight-sharing to optimize multiple architectures within a single set of\nweights, enabling the use of pre-trained models from open-source repositories.\nAdditionally, we introduce multi-bitwidth joint training and the SandwichQ\nrule, both of which are effective in handling multiple quantization bit-widths\nacross subnets. Experimental results demonstrate that our network achieves\ncompetitive accuracy with state-of-the-art methods across three benchmarks\nwhile significantly reducing memory and computational costs.\n","authors":["Nanyang Du","Chen Tang","Yuxiao Jiang","Yuan Meng","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08772v1","updated":"2024-09-13T12:30:15Z","published":"2024-09-13T12:30:15Z","title":"On the Computation of BD-Rate over a Set of Videos for Fair Assessment\n of Performance of Learned Video Codecs","summary":" The Bj{\\o}ntegaard Delta (BD) measure is widely employed to evaluate and\nquantify the variations in the rate-distortion(RD) performance across different\ncodecs. Many researchers report the average BD value over multiple videos\nwithin a dataset for different codecs. We claim that the current practice in\nthe learned video compression community of computing the average BD value over\na dataset based on the average RD curve of multiple videos can lead to\nmisleading conclusions. We show both by analysis of a simplistic case of linear\nRD curves and experimental results with two recent learned video codecs that\naveraging RD curves can lead to a single video to disproportionately influence\nthe average BD value especially when the operating bitrate range of different\ncodecs do not exactly match. Instead, we advocate for calculating the BD\nmeasure per-video basis, as commonly done by the traditional video compression\ncommunity, followed by averaging the individual BD values over videos, to\nprovide a fair comparison of learned video codecs. Our experimental results\ndemonstrate that the comparison of two recent learned video codecs is affected\nby how we evaluate the average BD measure.\n","authors":["M. Akin Yilmaz","Onur Keleş","A. Murat Tekalp"],"pdf_url":"https://arxiv.org/pdf/2409.08772v1.pdf","comment":"Submitted to IEEE ICASSP 2025"},{"id":"http://arxiv.org/abs/2401.16318v2","updated":"2024-09-13T12:27:44Z","published":"2024-01-29T17:21:41Z","title":"Defining and Extracting generalizable interaction primitives from DNNs","summary":" Faithfully summarizing the knowledge encoded by a deep neural network (DNN)\ninto a few symbolic primitive patterns without losing much information\nrepresents a core challenge in explainable AI. To this end, Ren et al. (2024)\nhave derived a series of theorems to prove that the inference score of a DNN\ncan be explained as a small set of interactions between input variables.\nHowever, the lack of generalization power makes it still hard to consider such\ninteractions as faithful primitive patterns encoded by the DNN. Therefore,\ngiven different DNNs trained for the same task, we develop a new method to\nextract interactions that are shared by these DNNs. 
Experiments show that the\nextracted interactions can better reflect common knowledge shared by different\nDNNs.\n","authors":["Lu Chen","Siyu Lou","Benhao Huang","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.16318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17323v2","updated":"2024-09-13T12:25:58Z","published":"2024-06-25T07:14:15Z","title":"XAMI -- A Benchmark Dataset for Artefact Detection in XMM-Newton Optical\n Images","summary":" Reflected or scattered light produce artefacts in astronomical observations\nthat can negatively impact the scientific study. Hence, automated detection of\nthese artefacts is highly beneficial, especially with the increasing amounts of\ndata gathered. Machine learning methods are well-suited to this problem, but\ncurrently there is a lack of annotated data to train such approaches to detect\nartefacts in astronomical observations. In this work, we present a dataset of\nimages from the XMM-Newton space telescope Optical Monitoring camera showing\ndifferent types of artefacts. We hand-annotated a sample of 1000 images with\nartefacts which we use to train automated ML methods. We further demonstrate\ntechniques tailored for accurate detection and masking of artefacts using\ninstance segmentation. We adopt a hybrid approach, combining knowledge from\nboth convolutional neural networks (CNNs) and transformer-based models and use\ntheir advantages in segmentation. The presented method and dataset will advance\nartefact detection in astronomical observations by providing a reproducible\nbaseline. All code and data are made available\n(https://github.com/ESA-Datalabs/XAMI-model and\nhttps://github.com/ESA-Datalabs/XAMI-dataset).\n","authors":["Elisabeta-Iulia Dima","Pablo Gómez","Sandor Kruk","Peter Kretschmar","Simon Rosen","Călin-Adrian Popa"],"pdf_url":"https://arxiv.org/pdf/2406.17323v2.pdf","comment":"Accepted for oral presentation at SPAICE 2024"},{"id":"http://arxiv.org/abs/2409.08769v1","updated":"2024-09-13T12:21:25Z","published":"2024-09-13T12:21:25Z","title":"Causal Transformer for Fusion and Pose Estimation in Deep Visual\n Inertial Odometry","summary":" In recent years, transformer-based architectures become the de facto standard\nfor sequence modeling in deep learning frameworks. Inspired by the successful\nexamples, we propose a causal visual-inertial fusion transformer (VIFT) for\npose estimation in deep visual-inertial odometry. This study aims to improve\npose estimation accuracy by leveraging the attention mechanisms in\ntransformers, which better utilize historical data compared to the recurrent\nneural network (RNN) based methods seen in recent methods. Transformers\ntypically require large-scale data for training. To address this issue, we\nutilize inductive biases for deep VIO networks. Since latent visual-inertial\nfeature vectors encompass essential information for pose estimation, we employ\ntransformers to refine pose estimates by updating latent vectors temporally.\nOur study also examines the impact of data imbalance and rotation learning\nmethods in supervised end-to-end learning of visual inertial odometry by\nutilizing specialized gradients in backpropagation for the elements of SE$(3)$\ngroup. The proposed method is end-to-end trainable and requires only a\nmonocular camera and IMU during inference. Experimental results demonstrate\nthat VIFT increases the accuracy of monocular VIO networks, achieving\nstate-of-the-art results when compared to previous methods on the KITTI\ndataset. 
The code will be made available at https://github.com/ybkurt/VIFT.\n","authors":["Yunus Bilge Kurt","Ahmet Akman","A. Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2409.08769v1.pdf","comment":"Accepted to ECCV 2024 2nd Workshop on Vision-Centric Autonomous\n Driving (VCAD)"},{"id":"http://arxiv.org/abs/2409.08744v1","updated":"2024-09-13T11:52:16Z","published":"2024-09-13T11:52:16Z","title":"Uncertainty and Generalizability in Foundation Models for Earth\n Observation","summary":" We take the perspective in which we want to design a downstream task (such as\nestimating vegetation coverage) on a certain area of interest (AOI) with a\nlimited labeling budget. By leveraging an existing Foundation Model (FM) we\nmust decide whether we train a downstream model on a different but label-rich\nAOI hoping it generalizes to our AOI, or we split labels in our AOI for\ntraining and validating. In either case, we face choices concerning what FM to\nuse, how to sample our AOI for labeling, etc. which affect both the performance\nand uncertainty of the results. In this work, we perform a large ablative study\nusing eight existing FMs on either Sentinel 1 or Sentinel 2 as input data, and\nthe classes from the ESA World Cover product as downstream tasks across eleven\nAOIs. We do repeated sampling and training, resulting in an ablation of some\n500K simple linear regression models. Our results show both the limits of\nspatial generalizability across AOIs and the power of FMs where we are able to\nget over 0.9 correlation coefficient between predictions and targets on\ndifferent chip level predictive tasks. And still, performance and uncertainty\nvary greatly across AOIs, tasks and FMs. We believe this is a key issue in\npractice, because there are many design decisions behind each FM and downstream\ntask (input modalities, sampling, architectures, pretraining, etc.) and usually\na downstream task designer is aware of and can decide upon a few of them.\nThrough this work, we advocate for the usage of the methodology herein\ndescribed (large ablations on reference global labels and simple probes), both\nwhen publishing new FMs, and to make informed decisions when designing\ndownstream tasks to use them.\n","authors":["Raul Ramos-Pollan","Freddie Kalaitzis","Karthick Panner Selvam"],"pdf_url":"https://arxiv.org/pdf/2409.08744v1.pdf","comment":"A large ablation study measuring uncertainty and spatial\n generalizability with 8 foundation models, 11 world regions and 7 downstream\n tasks"},{"id":"http://arxiv.org/abs/2406.18140v3","updated":"2024-09-13T11:36:49Z","published":"2024-06-26T07:44:27Z","title":"Exclusive Style Removal for Cross Domain Novel Class Discovery","summary":" As a promising field in open-world learning, \\textit{Novel Class Discovery}\n(NCD) is usually a task to cluster unseen novel classes in an unlabeled set\nbased on the prior knowledge of labeled data within the same domain. However,\nthe performance of existing NCD methods could be severely compromised when\nnovel classes are sampled from a different distribution with the labeled ones.\nIn this paper, we explore and establish the solvability of NCD in cross domain\nsetting with the necessary condition that style information must be removed.\nBased on the theoretical analysis, we introduce an exclusive style removal\nmodule for extracting style information that is distinctive from the baseline\nfeatures, thereby facilitating inference. 
Moreover, this module is easy to\nintegrate with other NCD methods, acting as a plug-in to improve performance on\nnovel classes with different distributions compared to the seen labeled set.\nAdditionally, recognizing the non-negligible influence of different backbones\nand pre-training strategies on the performance of the NCD methods, we build a\nfair benchmark for future NCD research. Extensive experiments on three common\ndatasets demonstrate the effectiveness of our proposed module.\n","authors":["Yicheng Wang","Feng Liu","Junmin Liu","Kai Sun"],"pdf_url":"https://arxiv.org/pdf/2406.18140v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12568v2","updated":"2024-09-13T11:31:05Z","published":"2024-07-17T13:51:49Z","title":"LTRL: Boosting Long-tail Recognition via Reflective Learning","summary":" In real-world scenarios, knowledge distributions often exhibit a long tail.\nHumans manage to master knowledge uniformly across imbalanced distributions, a\nfeat attributed to their diligent practices of reviewing, summarizing, and\ncorrecting errors. Motivated by this learning process, we propose a novel\nlearning paradigm, called reflecting learning, for handling long-tail\nrecognition. Our method integrates three processes for reviewing past\npredictions during training, summarizing and leveraging the feature relation\nacross classes, and correcting gradient conflict for loss functions. These\ndesigns are lightweight enough to plug and play with existing long-tail\nlearning methods, achieving state-of-the-art performance in popular long-tail\nvisual benchmarks. The experimental results highlight the great potential of\nreflecting learning in dealing with long-tail recognition.\n","authors":["Qihao Zhao","Yalun Dai","Shen Lin","Wei Hu","Fan Zhang","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2407.12568v2.pdf","comment":"ECCV2024, Oral"},{"id":"http://arxiv.org/abs/2407.18456v2","updated":"2024-09-13T11:12:00Z","published":"2024-07-26T01:42:31Z","title":"Diffusion-driven lensless fiber endomicroscopic quantitative phase\n imaging towards digital pathology","summary":" The lensless fiber endomicroscope is an emerging tool for in-vivo microscopic\nimaging, where quantitative phase imaging (QPI) can be utilized as a label-free\nmethod to enhance image contrast. However, existing single-shot phase\nreconstruction methods through the lensless fiber endomicroscope typically\nperform well on simple images but struggle with complex microscopic structures.\nHere, we propose a speckle-conditioned diffusion model (SpecDiffusion), which\nreconstructs phase images directly from speckles captured at the detection side\nof a multi-core fiber (MCF). Unlike conventional neural networks, SpecDiffusion\nemploys iterative phase denoising steps for speckle-driven phase\nreconstruction. The iteration scheme allows SpecDiffusion to break down the\nphase reconstruction process into multiple steps, gradually building up to the\nfinal phase image. This attribute alleviates the computation challenge at each\nstep and enables the reconstruction of rich details in complex microscopic\nimages. To validate its efficacy, we build an optical system to capture\nspeckles from MCF and construct a dataset consisting of 100,000 paired images.\nSpecDiffusion provides high-fidelity phase reconstruction results and shows\npowerful generalization capacity for unseen objects, such as test charts and\nbiological tissues, reducing the average mean absolute error of the\nreconstructed tissue images by 7 times. 
Furthermore, the reconstructed tissue\nimages using SpecDiffusion show higher accuracy in zero-shot cell segmentation\ntasks compared to the conventional method, demonstrating the potential for\nfurther cell morphology analysis through the learning-based lensless fiber\nendomicroscope. SpecDiffusion offers a precise and generalized method for phase\nreconstruction through scattering media, including MCFs, opening new\nperspectives in lensless fiber endomicroscopic imaging.\n","authors":["Zhaoqing Chen","Jiawei Sun","Xinyi Ye","Bin Zhao","Xuelong Li","Juergen Czarske"],"pdf_url":"https://arxiv.org/pdf/2407.18456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08712v1","updated":"2024-09-13T10:59:24Z","published":"2024-09-13T10:59:24Z","title":"Layerwise Change of Knowledge in Neural Networks","summary":" This paper aims to explain how a deep neural network (DNN) gradually extracts\nnew knowledge and forgets noisy features through layers in forward propagation.\nUp to now, although the definition of knowledge encoded by the DNN has not\nreached a consensus, previous studies have derived a series of mathematical\nevidence to take interactions as symbolic primitive inference patterns encoded\nby a DNN. We extend the definition of interactions and, for the first time,\nextract interactions encoded by intermediate layers. We quantify and track the\nnewly emerged interactions and the forgotten interactions in each layer during\nthe forward propagation, which shed new light on the learning behavior of DNNs.\nThe layer-wise change of interactions also reveals the change of the\ngeneralization capacity and instability of feature representations of a DNN.\n","authors":["Xu Cheng","Lei Cheng","Zhaoran Peng","Yang Xu","Tian Han","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15589v2","updated":"2024-09-13T10:47:25Z","published":"2024-07-22T12:26:08Z","title":"Exploring the Effectiveness of Object-Centric Representations in Visual\n Question Answering: Comparative Insights with Foundation Models","summary":" Object-centric (OC) representations, which represent the state of a visual\nscene by modeling it as a composition of objects, have the potential to be used\nin various downstream tasks to achieve systematic compositional generalization\nand facilitate reasoning. However, these claims have not been thoroughly\nanalyzed yet. Recently, foundation models have demonstrated unparalleled\ncapabilities across diverse domains from language to computer vision, marking\nthem as a potential cornerstone of future research for a multitude of\ncomputational tasks. In this paper, we conduct an extensive empirical study on\nrepresentation learning for downstream Visual Question Answering (VQA), which\nrequires an accurate compositional understanding of the scene. We thoroughly\ninvestigate the benefits and trade-offs of OC models and alternative approaches\nincluding large pre-trained foundation models on both synthetic and real-world\ndata, and demonstrate a viable way to achieve the best of both worlds. 
The\nextensiveness of our study, encompassing over 800 downstream VQA models and 15\ndifferent types of upstream representations, also provides several additional\ninsights that we believe will be of interest to the community at large.\n","authors":["Amir Mohammad Karimi Mamaghan","Samuele Papa","Karl Henrik Johansson","Stefan Bauer","Andrea Dittadi"],"pdf_url":"https://arxiv.org/pdf/2407.15589v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08695v1","updated":"2024-09-13T10:27:27Z","published":"2024-09-13T10:27:27Z","title":"Precision Aquaculture: An Integrated Computer Vision and IoT Approach\n for Optimized Tilapia Feeding","summary":" Traditional fish farming practices often lead to inefficient feeding,\nresulting in environmental issues and reduced productivity. We developed an\ninnovative system combining computer vision and IoT technologies for precise\nTilapia feeding. Our solution uses real-time IoT sensors to monitor water\nquality parameters and computer vision algorithms to analyze fish size and\ncount, determining optimal feed amounts. A mobile app enables remote monitoring\nand control. We utilized YOLOv8 for keypoint detection to measure Tilapia\nweight from length, achieving \\textbf{94\\%} precision on 3,500 annotated\nimages. Pixel-based measurements were converted to centimeters using depth\nestimation for accurate feeding calculations. Our method, with data collection\nmirroring inference conditions, significantly improved results. Preliminary\nestimates suggest this approach could increase production up to 58 times\ncompared to traditional farms. Our models, code, and dataset are open-source:\nthe code, dataset, and models are available upon reasonable request.\n","authors":["Rania Hossam","Ahmed Heakl","Walid Gomaa"],"pdf_url":"https://arxiv.org/pdf/2409.08695v1.pdf","comment":"8 pages, 6 figures, 3 tables, 21st International Conference on\n Informatics in Control, Automation, and Robotics"},{"id":"http://arxiv.org/abs/2409.08691v1","updated":"2024-09-13T10:19:10Z","published":"2024-09-13T10:19:10Z","title":"Autoregressive Sequence Modeling for 3D Medical Image Representation","summary":" Three-dimensional (3D) medical images, such as Computed Tomography (CT) and\nMagnetic Resonance Imaging (MRI), are essential for clinical applications.\nHowever, the need for diverse and comprehensive representations is particularly\npronounced when considering the variability across different organs, diagnostic\ntasks, and imaging modalities. How to effectively interpret the intricate\ncontextual information and extract meaningful insights from these images\nremains an open challenge to the community. While current self-supervised\nlearning methods have shown potential, they often consider an image as a whole,\nthereby overlooking the extensive, complex relationships among local regions\nfrom one or multiple images. In this work, we introduce a pioneering method for\nlearning 3D medical image representations through an autoregressive\npre-training framework. Our approach sequences various 3D medical images based\non spatial, contrast, and semantic correlations, treating them as\ninterconnected visual tokens within a token sequence. By employing an\nautoregressive sequence modeling task, we predict the next visual token in the\nsequence, which allows our model to deeply understand and integrate the\ncontextual information inherent in 3D medical images. 
Additionally, we\nimplement a random startup strategy to avoid overestimating token relationships\nand to enhance the robustness of learning. The effectiveness of our approach is\ndemonstrated by the superior performance over others on nine downstream tasks\nin public datasets.\n","authors":["Siwen Wang","Churan Wang","Fei Gao","Lixian Su","Fandong Zhang","Yizhou Wang","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2409.08691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08688v1","updated":"2024-09-13T10:15:28Z","published":"2024-09-13T10:15:28Z","title":"GenMapping: Unleashing the Potential of Inverse Perspective Mapping for\n Robust Online HD Map Construction","summary":" Online High-Definition (HD) maps have emerged as the preferred option for\nautonomous driving, overshadowing the counterpart offline HD maps due to\nflexible update capability and lower maintenance costs. However, contemporary\nonline HD map models embed parameters of visual sensors into training,\nresulting in a significant decrease in generalization performance when applied\nto visual sensors with different parameters. Inspired by the inherent potential\nof Inverse Perspective Mapping (IPM), where camera parameters are decoupled\nfrom the training process, we have designed a universal map generation\nframework, GenMapping. The framework is established with a triadic synergy\narchitecture, including principal and dual auxiliary branches. When faced with\na coarse road image with local distortion translated via IPM, the principal\nbranch learns robust global features under the state space models. The two\nauxiliary branches are a dense perspective branch and a sparse prior branch.\nThe former exploits the correlation information between static and moving\nobjects, whereas the latter introduces the prior knowledge of OpenStreetMap\n(OSM). The triple-enhanced merging module is crafted to synergistically\nintegrate the unique spatial features from all three branches. To further\nimprove generalization capabilities, a Cross-View Map Learning (CVML) scheme is\nleveraged to realize joint learning within the common space. Additionally, a\nBidirectional Data Augmentation (BiDA) module is introduced to mitigate\nreliance on datasets concurrently. A thorough array of experimental results\nshows that the proposed model surpasses current state-of-the-art methods in\nboth semantic mapping and vectorized mapping, while also maintaining a rapid\ninference speed. The source code will be publicly available at\nhttps://github.com/lynn-yu/GenMapping.\n","authors":["Siyu Li","Kailun Yang","Hao Shi","Song Wang","You Yao","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2409.08688v1.pdf","comment":"The source code will be publicly available at\n https://github.com/lynn-yu/GenMapping"},{"id":"http://arxiv.org/abs/2212.14181v2","updated":"2024-09-13T09:55:48Z","published":"2022-12-29T05:57:29Z","title":"Efficient Image Super-Resolution with Feature Interaction Weighted\n Hybrid Network","summary":" Lightweight image super-resolution aims to reconstruct high-resolution images\nfrom low-resolution images using low computational costs. However, existing\nmethods result in the loss of middle-layer features due to activation\nfunctions. To minimize the impact of intermediate feature loss on\nreconstruction quality, we propose a Feature Interaction Weighted Hybrid\nNetwork (FIWHN), which comprises a series of Wide-residual Distillation\nInteraction Block (WDIB) as the backbone. 
Every third WDIB forms a Feature\nShuffle Weighted Group (FSWG) by applying mutual information shuffle and\nfusion. Moreover, to mitigate the negative effects of intermediate feature\nloss, we introduce Wide Residual Weighting units within WDIB. These units\neffectively fuse features of varying levels of detail through a Wide-residual\nDistillation Connection (WRDC) and a Self-Calibrating Fusion (SCF). To\ncompensate for global feature deficiencies, we incorporate a Transformer and\nexplore a novel architecture to combine CNN and Transformer. We show that our\nFIWHN achieves a favorable balance between performance and efficiency through\nextensive experiments on low-level and high-level tasks. Codes will be\navailable at \\url{https://github.com/IVIPLab/FIWHN}.\n","authors":["Wenjie Li","Juncheng Li","Guangwei Gao","Weihong Deng","Jian Yang","Guo-Jun Qi","Chia-Wen Lin"],"pdf_url":"https://arxiv.org/pdf/2212.14181v2.pdf","comment":"12 pages, 12 figures, IEEE Transactions on Multimedia (extention of\n our AAAI2022)"},{"id":"http://arxiv.org/abs/2406.01829v2","updated":"2024-09-13T09:39:53Z","published":"2024-06-03T22:56:40Z","title":"FaçAID: A Transformer Model for Neuro-Symbolic Facade Reconstruction","summary":" We introduce a neuro-symbolic transformer-based model that converts flat,\nsegmented facade structures into procedural definitions using a custom-designed\nsplit grammar. To facilitate this, we first develop a semi-complex split\ngrammar tailored for architectural facades and then generate a dataset\ncomprising of facades alongside their corresponding procedural representations.\nThis dataset is used to train our transformer model to convert segmented, flat\nfacades into the procedural language of our grammar. During inference, the\nmodel applies this learned transformation to new facade segmentations,\nproviding a procedural representation that users can adjust to generate varied\nfacade designs. This method not only automates the conversion of static facade\nimages into dynamic, editable procedural formats but also enhances the design\nflexibility, allowing for easy modifications.\n","authors":["Aleksander Plocharski","Jan Swidzinski","Joanna Porter-Sobieraj","Przemyslaw Musialski"],"pdf_url":"https://arxiv.org/pdf/2406.01829v2.pdf","comment":"11 pages, 11 figures, in ACM SIGGRAPH Asia 2024 Conference Papers\n Proceedings"},{"id":"http://arxiv.org/abs/2407.13863v4","updated":"2024-09-13T09:36:36Z","published":"2024-07-18T19:16:22Z","title":"A Closer Look at GAN Priors: Exploiting Intermediate Features for\n Enhanced Model Inversion Attacks","summary":" Model Inversion (MI) attacks aim to reconstruct privacy-sensitive training\ndata from released models by utilizing output information, raising extensive\nconcerns about the security of Deep Neural Networks (DNNs). Recent advances in\ngenerative adversarial networks (GANs) have contributed significantly to the\nimproved performance of MI attacks due to their powerful ability to generate\nrealistic images with high fidelity and appropriate semantics. However,\nprevious MI attacks have solely disclosed private information in the latent\nspace of GAN priors, limiting their semantic extraction and transferability\nacross multiple target models and datasets. To address this challenge, we\npropose a novel method, Intermediate Features enhanced Generative Model\nInversion (IF-GMI), which disassembles the GAN structure and exploits features\nbetween intermediate blocks. 
This allows us to extend the optimization space\nfrom latent code to intermediate features with enhanced expressive\ncapabilities. To prevent GAN priors from generating unrealistic images, we\napply a L1 ball constraint to the optimization process. Experiments on multiple\nbenchmarks demonstrate that our method significantly outperforms previous\napproaches and achieves state-of-the-art results under various settings,\nespecially in the out-of-distribution (OOD) scenario. Our code is available at:\nhttps://github.com/final-solution/IF-GMI\n","authors":["Yixiang Qiu","Hao Fang","Hongyao Yu","Bin Chen","MeiKang Qiu","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2407.13863v4.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.08669v1","updated":"2024-09-13T09:32:38Z","published":"2024-09-13T09:32:38Z","title":"AdR-Gaussian: Accelerating Gaussian Splatting with Adaptive Radius","summary":" 3D Gaussian Splatting (3DGS) is a recent explicit 3D representation that has\nachieved high-quality reconstruction and real-time rendering of complex scenes.\nHowever, the rasterization pipeline still suffers from unnecessary overhead\nresulting from avoidable serial Gaussian culling, and uneven load due to the\ndistinct number of Gaussian to be rendered across pixels, which hinders wider\npromotion and application of 3DGS. In order to accelerate Gaussian splatting,\nwe propose AdR-Gaussian, which moves part of serial culling in Render stage\ninto the earlier Preprocess stage to enable parallel culling, employing\nadaptive radius to narrow the rendering pixel range for each Gaussian, and\nintroduces a load balancing method to minimize thread waiting time during the\npixel-parallel rendering. Our contributions are threefold, achieving a\nrendering speed of 310% while maintaining equivalent or even better quality\nthan the state-of-the-art. Firstly, we propose to early cull Gaussian-Tile\npairs of low splatting opacity based on an adaptive radius in the\nGaussian-parallel Preprocess stage, which reduces the number of affected tile\nthrough the Gaussian bounding circle, thus reducing unnecessary overhead and\nachieving faster rendering speed. Secondly, we further propose early culling\nbased on axis-aligned bounding box for Gaussian splatting, which achieves a\nmore significant reduction in ineffective expenses by accurately calculating\nthe Gaussian size in the 2D directions. Thirdly, we propose a balancing\nalgorithm for pixel thread load, which compresses the information of heavy-load\npixels to reduce thread waiting time, and enhance information of light-load\npixels to hedge against rendering quality loss. Experiments on three datasets\ndemonstrate that our algorithm can significantly improve the Gaussian Splatting\nrendering speed.\n","authors":["Xinzhe Wang","Ran Yi","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2409.08669v1.pdf","comment":"SIGGRAPH Asia 2024 Conference Papers (SA Conference Papers '24),\n December 03-06, 2024, Tokyo, Japan"},{"id":"http://arxiv.org/abs/2409.08667v1","updated":"2024-09-13T09:30:19Z","published":"2024-09-13T09:30:19Z","title":"Test-time Training for Hyperspectral Image Super-resolution","summary":" The progress on Hyperspectral image (HSI) super-resolution (SR) is still\nlagging behind the research of RGB image SR. HSIs usually have a high number of\nspectral bands, so accurately modeling spectral band interaction for HSI SR is\nhard. Also, training data for HSI SR is hard to obtain so the dataset is\nusually rather small. 
In this work, we propose a new test-time training method\nto tackle this problem. Specifically, a novel self-training framework is\ndeveloped, where more accurate pseudo-labels and more accurate LR-HR\nrelationships are generated so that the model can be further trained with them\nto improve performance. In order to better support our test-time training\nmethod, we also propose a new network architecture to learn HSI SR without\nmodeling spectral band interaction and propose a new data augmentation method\nSpectral Mixup to increase the diversity of the training data at test time. We\nalso collect a new HSI dataset with a diverse set of images of interesting\nobjects ranging from food to vegetation, to materials, and to general scenes.\nExtensive experiments on multiple datasets show that our method can improve the\nperformance of pre-trained models significantly after test-time training and\noutperform competing methods significantly for HSI SR.\n","authors":["Ke Li","Luc Van Gool","Dengxin Dai"],"pdf_url":"https://arxiv.org/pdf/2409.08667v1.pdf","comment":"Accepted to T-PAMI"},{"id":"http://arxiv.org/abs/2305.01939v2","updated":"2024-09-13T09:22:38Z","published":"2023-05-03T07:32:28Z","title":"Where We Have Arrived in Proving the Emergence of Sparse Symbolic\n Concepts in AI Models","summary":" This study aims to prove the emergence of symbolic concepts (or more\nprecisely, sparse primitive inference patterns) in well-trained deep neural\nnetworks (DNNs). Specifically, we prove the following three conditions for the\nemergence. (i) The high-order derivatives of the network output with respect to\nthe input variables are all zero. (ii) The DNN can be used on occluded samples\nand when the input sample is less occluded, the DNN will yield higher\nconfidence. (iii) The confidence of the DNN does not significantly degrade on\noccluded samples. These conditions are quite common, and we prove that under\nthese conditions, the DNN will only encode a relatively small number of sparse\ninteractions between input variables. Moreover, we can consider such\ninteractions as symbolic primitive inference patterns encoded by a DNN, because\nwe show that inference scores of the DNN on an exponentially large number of\nrandomly masked samples can always be well mimicked by numerical effects of\njust a few interactions.\n","authors":["Qihan Ren","Jiayang Gao","Wen Shen","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.01939v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13091v2","updated":"2024-09-13T09:19:14Z","published":"2023-02-25T14:44:40Z","title":"Explaining Generalization Power of a DNN Using Interactive Concepts","summary":" This paper explains the generalization power of a deep neural network (DNN)\nfrom the perspective of interactions. Although there is no universally accepted\ndefinition of the concepts encoded by a DNN, the sparsity of interactions in a\nDNN has been proved, i.e., the output score of a DNN can be well explained by a\nsmall number of interactions between input variables. In this way, to some\nextent, we can consider such interactions as interactive concepts encoded by\nthe DNN. Therefore, in this paper, we derive an analytic explanation of\ninconsistency of concepts of different complexities. This may shed new lights\non using the generalization power of concepts to explain the generalization\npower of the entire DNN. 
Besides, we discover that the DNN with stronger\ngeneralization power usually learns simple concepts more quickly and encodes\nfewer complex concepts. We also discover the detouring dynamics of learning\ncomplex concepts, which explains both the high learning difficulty and the low\ngeneralization power of complex concepts. The code will be released when the\npaper is accepted.\n","authors":["Huilin Zhou","Hao Zhang","Huiqi Deng","Dongrui Liu","Wen Shen","Shih-Han Chan","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08652v1","updated":"2024-09-13T09:11:52Z","published":"2024-09-13T09:11:52Z","title":"SkinFormer: Learning Statistical Texture Representation with Transformer\n for Skin Lesion Segmentation","summary":" Accurate skin lesion segmentation from dermoscopic images is of great\nimportance for skin cancer diagnosis. However, automatic segmentation of\nmelanoma remains a challenging task because it is difficult to incorporate\nuseful texture representations into the learning process. Texture\nrepresentations are not only related to the local structural information\nlearned by CNNs, but also include the global statistical texture information of\nthe input image. In this paper, we propose a trans\\textbf{Former} network\n(\\textbf{SkinFormer}) that efficiently extracts and fuses statistical texture\nrepresentation for \\textbf{Skin} lesion segmentation. Specifically, to quantify\nthe statistical texture of input features, a Kurtosis-guided Statistical\nCounting Operator is designed. We propose Statistical Texture Fusion\nTransformer and Statistical Texture Enhance Transformer with the help of the\nKurtosis-guided Statistical Counting Operator by utilizing the transformer's\nglobal attention mechanism. The former fuses structural texture information and\nstatistical texture information, and the latter enhances the statistical\ntexture of multi-scale features. Extensive experiments on three publicly\navailable skin lesion datasets validate that our SkinFormer outperforms other\nSOTA methods, and our method achieves a 93.2\\% Dice score on ISIC 2018.\nSkinFormer can easily be extended to segment 3D images in the future. Our code is\navailable at https://github.com/Rongtao-Xu/SkinFormer.\n","authors":["Rongtao Xu","Changwei Wang","Jiguang Zhang","Shibiao Xu","Weiliang Meng","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08652v1.pdf","comment":"12 pages, 8 figures, published in JBHI"},{"id":"http://arxiv.org/abs/2409.07961v2","updated":"2024-09-13T08:37:39Z","published":"2024-09-12T11:42:40Z","title":"Estimating Atmospheric Variables from Digital Typhoon Satellite Images\n via Conditional Denoising Diffusion Models","summary":" This study explores the application of diffusion models in the field of\ntyphoons, predicting multiple ERA5 meteorological variables simultaneously from\nDigital Typhoon satellite images. The focus of this study is Taiwan, an area\nhighly vulnerable to typhoons. By comparing the performance of the\nConditional Denoising Diffusion Probability Model (CDDPM) with Convolutional\nNeural Networks (CNN) and Squeeze-and-Excitation Networks (SENet), results\nsuggest that the CDDPM performs best in generating accurate and realistic\nmeteorological data. Specifically, CDDPM achieved a PSNR of 32.807, which is\napproximately 7.9% higher than CNN and 5.5% higher than SENet. Furthermore,\nCDDPM recorded an RMSE of 0.032, showing an 11.1% improvement over CNN and 8.6%\nimprovement over SENet. 
A key application of this research can be for\nimputation purposes in missing meteorological datasets and generate additional\nhigh-quality meteorological data using satellite images. It is hoped that the\nresults of this analysis will enable more robust and detailed forecasting,\nreducing the impact of severe weather events on vulnerable regions. Code\naccessible at https://github.com/TammyLing/Typhoon-forecasting.\n","authors":["Zhangyue Ling","Pritthijit Nath","César Quilodrán-Casas"],"pdf_url":"https://arxiv.org/pdf/2409.07961v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.07319v2","updated":"2024-09-13T08:30:38Z","published":"2024-03-12T05:06:07Z","title":"Efficient Diffusion Model for Image Restoration by Residual Shifting","summary":" While diffusion-based image restoration (IR) methods have achieved remarkable\nsuccess, they are still limited by the low inference speed attributed to the\nnecessity of executing hundreds or even thousands of sampling steps. Existing\nacceleration sampling techniques, though seeking to expedite the process,\ninevitably sacrifice performance to some extent, resulting in over-blurry\nrestored outcomes. To address this issue, this study proposes a novel and\nefficient diffusion model for IR that significantly reduces the required number\nof diffusion steps. Our method avoids the need for post-acceleration during\ninference, thereby avoiding the associated performance deterioration.\nSpecifically, our proposed method establishes a Markov chain that facilitates\nthe transitions between the high-quality and low-quality images by shifting\ntheir residuals, substantially improving the transition efficiency. A carefully\nformulated noise schedule is devised to flexibly control the shifting speed and\nthe noise strength during the diffusion process. Extensive experimental\nevaluations demonstrate that the proposed method achieves superior or\ncomparable performance to current state-of-the-art methods on three classical\nIR tasks, namely image super-resolution, image inpainting, and blind face\nrestoration, \\textit{\\textbf{even only with four sampling steps}}. Our code and\nmodel are publicly available at \\url{https://github.com/zsyOAOA/ResShift}.\n","authors":["Zongsheng Yue","Jianyi Wang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2403.07319v2.pdf","comment":"Accepted by TPAMI@2024. Code: https://github.com/zsyOAOA/ResShift"},{"id":"http://arxiv.org/abs/2409.08619v1","updated":"2024-09-13T08:17:51Z","published":"2024-09-13T08:17:51Z","title":"Joint image reconstruction and segmentation of real-time cardiac MRI in\n free-breathing using a model based on disentangled representation learning","summary":" A joint image reconstruction and segmentation approach based on disentangled\nrepresentation learning was trained to enable cardiac cine MR imaging in\nreal-time and under free-breathing. An exploratory feasibility study tested the\nproposed method in undersampled real-time acquisitions based on an in-house\ndeveloped spiral bSSFP pulse sequence in eight healthy participants and five\npatients with intermittent atrial fibrillation. 
Images and predicted LV\nsegmentations were compared to the reference standard of ECG-gated segmented\nCartesian cine in repeated breath-holds and corresponding manual segmentation.\nOn a 5-point Likert scale, image quality of the real-time breath-hold approach\nand Cartesian cine was comparable in healthy participants (RT-BH: 1.99 $\\pm$\n.98, Cartesian: 1.94 $\\pm$ .86, p=.052), but slightly inferior in\nfree-breathing (RT-FB: 2.40 $\\pm$ .98, p<.001). In patients with arrhythmia,\nimage quality from both real-time approaches was favourable (RT-BH: 2.10 $\\pm$\n1.28, p<.001, RT-FB: 2.40 $\\pm$ 1.13, p<.001, Cartesian: 2.68 $\\pm$ 1.13).\nIntra-observer reliability was good (ICC=.77, 95%-confidence interval [.75,\n.79], p<.001). In functional analysis, a positive bias was observed for\nejection fractions derived from the proposed model compared to the clinical\nreference standard (RT-BH mean EF: 58.5 $\\pm$ 5.6%, bias: +3.47%,\n95%-confidence interval [-.86, 7.79%], RT-FB mean: 57.9 $\\pm$ 10.6%, bias:\n+1.45%, [-3.02, 5.91%], Cartesian mean: 54.9 $\\pm$ 6.7%). The introduced\nreal-time MR imaging technique is capable of acquiring high-quality cardiac\ncine data in 1-2 minutes without the need for ECG gating and breath-holds. It\nthus offers a promising alternative to the current clinical practice of\nsegmented acquisition, with shorter scan times, higher patient comfort and\nincreased robustness to arrhythmia and patient incompliance.\n","authors":["Tobias Wech","Oliver Schad","Simon Sauer","Jonas Kleineisel","Nils Petri","Peter Nordbeck","Thorsten A. Bley","Bettina Baeßler","Bernhard Petritsch","Julius F. Heidenreich"],"pdf_url":"https://arxiv.org/pdf/2409.08619v1.pdf","comment":"Submitted to the Journal of Cardiovascular Magnetic Resonance"},{"id":"http://arxiv.org/abs/2409.08618v1","updated":"2024-09-13T08:17:15Z","published":"2024-09-13T08:17:15Z","title":"TapToTab : Video-Based Guitar Tabs Generation using AI and Audio\n Analysis","summary":" The automation of guitar tablature generation from video inputs holds\nsignificant promise for enhancing music education, transcription accuracy, and\nperformance analysis. Existing methods face challenges with consistency and\ncompleteness, particularly in detecting fretboards and accurately identifying\nnotes. To address these issues, this paper introduces an advanced approach\nleveraging deep learning, specifically YOLO models for real-time fretboard\ndetection, and Fourier Transform-based audio analysis for precise note\nidentification. Experimental results demonstrate substantial improvements in\ndetection accuracy and robustness compared to traditional techniques. This\npaper outlines the development, implementation, and evaluation of these\nmethodologies, aiming to revolutionize guitar instruction by automating the\ncreation of guitar tabs from video recordings.\n","authors":["Ali Ghaleb","Eslam ElSadawy","Ihab Essam","Mohamed Abdelhakim","Seif-Eldin Zaki","Natalie Fahim","Razan Bayoumi","Hanan Hindy"],"pdf_url":"https://arxiv.org/pdf/2409.08618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08613v1","updated":"2024-09-13T07:59:15Z","published":"2024-09-13T07:59:15Z","title":"Dense Point Clouds Matter: Dust-GS for Scene Reconstruction from Sparse\n Viewpoints","summary":" 3D Gaussian Splatting (3DGS) has demonstrated remarkable performance in scene\nsynthesis and novel view synthesis tasks. Typically, the initialization of 3D\nGaussian primitives relies on point clouds derived from Structure-from-Motion\n(SfM) methods. 
However, in scenarios requiring scene reconstruction from sparse\nviewpoints, the effectiveness of 3DGS is significantly constrained by the\nquality of these initial point clouds and the limited number of input images.\nIn this study, we present Dust-GS, a novel framework specifically designed to\novercome the limitations of 3DGS in sparse viewpoint conditions. Instead of\nrelying solely on SfM, Dust-GS introduces an innovative point cloud\ninitialization technique that remains effective even with sparse input data.\nOur approach leverages a hybrid strategy that integrates an adaptive\ndepth-based masking technique, thereby enhancing the accuracy and detail of\nreconstructed scenes. Extensive experiments conducted on several benchmark\ndatasets demonstrate that Dust-GS surpasses traditional 3DGS methods in\nscenarios with sparse viewpoints, achieving superior scene reconstruction\nquality with a reduced number of input images.\n","authors":["Shan Chen","Jiale Zhou","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2409.08613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12405v3","updated":"2024-09-13T07:32:34Z","published":"2024-07-17T08:32:14Z","title":"Fisheye-Calib-Adapter: An Easy Tool for Fisheye Camera Model Conversion","summary":" The increasing necessity for fisheye cameras in fields such as robotics and\nautonomous driving has led to the proposal of various fisheye camera models.\nWhile the evolution of camera models has facilitated the development of diverse\nsystems in the field, the lack of adaptation between different fisheye camera\nmodels means that recalibration is always necessary, which is cumbersome. This\npaper introduces a conversion tool for various previously proposed fisheye\ncamera models. It is user-friendly, simple, yet extremely fast and accurate,\noffering conversion capabilities for a broader range of models compared to\nexisting tools. We have verified that models converted using our system perform\ncorrectly in applications such as SLAM. By utilizing our system, researchers\ncan obtain output parameters directly from input parameters without the need\nfor an image set and any recalibration processes, thus serving as a bridge\nacross different fisheye camera models in various research fields. We provide\nour system as an open source tool available at:\nhttps://github.com/eowjd0512/fisheye-calib-adapter\n","authors":["Sangjun Lee"],"pdf_url":"https://arxiv.org/pdf/2407.12405v3.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.08598v1","updated":"2024-09-13T07:28:57Z","published":"2024-09-13T07:28:57Z","title":"Knowledge-Enhanced Facial Expression Recognition with\n Emotional-to-Neutral Transformation","summary":" Existing facial expression recognition (FER) methods typically fine-tune a\npre-trained visual encoder using discrete labels. However, this form of\nsupervision limits to specify the emotional concept of different facial\nexpressions. In this paper, we observe that the rich knowledge in text\nembeddings, generated by vision-language models, is a promising alternative for\nlearning discriminative facial expression representations. Inspired by this, we\npropose a novel knowledge-enhanced FER method with an emotional-to-neutral\ntransformation. Specifically, we formulate the FER problem as a process to\nmatch the similarity between a facial expression representation and text\nembeddings. 
Then, we transform the facial expression representation to a\nneutral representation by simulating the difference in text embeddings from\ntextual facial expression to textual neutral. Finally, a self-contrast\nobjective is introduced to pull the facial expression representation closer to\nthe textual facial expression, while pushing it farther from the neutral\nrepresentation. We conduct evaluation with diverse pre-trained visual encoders\nincluding ResNet-18 and Swin-T on four challenging facial expression datasets.\nExtensive experiments demonstrate that our method significantly outperforms\nstate-of-the-art FER methods. The code will be publicly available.\n","authors":["Hangyu Li","Yihan Xu","Jiangchao Yao","Nannan Wang","Xinbo Gao","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2409.08598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08588v1","updated":"2024-09-13T07:08:48Z","published":"2024-09-13T07:08:48Z","title":"Improved Unet model for brain tumor image segmentation based on\n ASPP-coordinate attention mechanism","summary":" In this paper, we propose an improved Unet model for brain tumor image\nsegmentation, which combines coordinate attention mechanism and ASPP module to\nimprove the segmentation effect. After the data set is divided, we do the\nnecessary preprocessing to the image and use the improved model to experiment.\nFirst, we trained and validated the traditional Unet model. By analyzing the\nloss curve of the training set and the validation set, we can see that the loss\nvalue continues to decline at the first epoch and becomes stable at the eighth\nepoch. This process shows that the model constantly optimizes its parameters to\nimprove performance. At the same time, the change in the miou (mean\nIntersection over Union) index shows that the miou value exceeded 0.6 at the\n15th epoch, remained above 0.6 thereafter, and reached above 0.7 at the 46th\nepoch. These results indicate that the basic Unet model is effective in brain\ntumor image segmentation. Next, we introduce an improved Unet algorithm based\non coordinate attention mechanism and ASPP module for experiments. By observing\nthe loss change curves of the training set and the verification set, it is\nfound that the loss value reaches the lowest point at the sixth epoch and then\nremains relatively stable. At the same time, the miou indicator has stabilized\nabove 0.7 since the 20th epoch and has reached a maximum of 0.76. These results\nshow that the new mechanism introduced significantly improves the segmentation\nability of the model. Finally, we apply the trained traditional Unet model and\nthe improved Unet model based on the coordinate attention mechanism and ASPP\nmodule to the test set for brain tumor image segmentation prediction. Compared\nto the traditional Unet, the enhanced model offers superior segmentation and\nedge accuracy, providing a more reliable method for medical image analysis with\nthe coordinate attention mechanism and ASPP module.\n","authors":["Zixuan Wang","Yanlin Chen","Feiyang Wang","Qiaozhi Bao"],"pdf_url":"https://arxiv.org/pdf/2409.08588v1.pdf","comment":"5 pages, 8 figures, accepted by ICBASE 2024"},{"id":"http://arxiv.org/abs/2211.10881v3","updated":"2024-09-13T07:06:19Z","published":"2022-11-20T06:31:23Z","title":"Deepfake Detection: A Comprehensive Survey from the Reliability\n Perspective","summary":" The mushroomed Deepfake synthetic materials circulated on the internet have\nraised a profound social impact on politicians, celebrities, and individuals\nworldwide. 
In this survey, we provide a thorough review of the existing\nDeepfake detection studies from the reliability perspective. We identify three\nreliability-oriented research challenges in the current Deepfake detection\ndomain: transferability, interpretability, and robustness. Moreover, while\nsolutions have been frequently addressed regarding the three challenges, the\ngeneral reliability of a detection model has been barely considered, leading to\nthe lack of reliable evidence in real-life usages and even for prosecutions on\nDeepfake-related cases in court. We, therefore, introduce a model reliability\nstudy metric using statistical random sampling knowledge and the publicly\navailable benchmark datasets to review the reliability of the existing\ndetection models on arbitrary Deepfake candidate suspects. Case studies are\nfurther executed to justify the real-life Deepfake cases including different\ngroups of victims with the help of the reliably qualified detection models as\nreviewed in this survey. Reviews and experiments on the existing approaches\nprovide informative discussions and future research directions for Deepfake\ndetection.\n","authors":["Tianyi Wang","Xin Liao","Kam Pui Chow","Xiaodong Lin","Yinglong Wang"],"pdf_url":"https://arxiv.org/pdf/2211.10881v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08585v1","updated":"2024-09-13T07:04:05Z","published":"2024-09-13T07:04:05Z","title":"Optimizing 4D Lookup Table for Low-light Video Enhancement via Wavelet\n Priori","summary":" Low-light video enhancement is highly demanding in maintaining spatiotemporal\ncolor consistency. Therefore, improving the accuracy of color mapping and\nkeeping the latency low is challenging. Based on this, we propose incorporating\nWavelet-priori for 4D Lookup Table (WaveLUT), which effectively enhances the\ncolor coherence between video frames and the accuracy of color mapping while\nmaintaining low latency. Specifically, we use the wavelet low-frequency domain\nto construct an optimized lookup prior and achieve an adaptive enhancement\neffect through a designed Wavelet-prior 4D lookup table. To effectively\ncompensate the a priori loss in the low light region, we further explore a\ndynamic fusion strategy that adaptively determines the spatial weights based on\nthe correlation between the wavelet lighting prior and the target intensity\nstructure. In addition, during the training phase, we devise a text-driven\nappearance reconstruction method that dynamically balances brightness and\ncontent through multimodal semantics-driven Fourier spectra. Extensive\nexperiments on a wide range of benchmark datasets show that this method\neffectively enhances the previous method's ability to perceive the color space\nand achieves metric-favorable and perceptually oriented real-time enhancement\nwhile maintaining high efficiency.\n","authors":["Jinhong He","Minglong Xue","Wenhai Wang","Mingliang Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.08585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05402v2","updated":"2024-09-13T07:01:43Z","published":"2024-03-08T15:58:00Z","title":"DualBEV: Unifying Dual View Transformation with Probabilistic\n Correspondences","summary":" Camera-based Bird's-Eye-View (BEV) perception often struggles between\nadopting 3D-to-2D or 2D-to-3D view transformation (VT). 
The 3D-to-2D VT\ntypically employs resource-intensive Transformer to establish robust\ncorrespondences between 3D and 2D features, while the 2D-to-3D VT utilizes the\nLift-Splat-Shoot (LSS) pipeline for real-time application, potentially missing\ndistant information. To address these limitations, we propose DualBEV, a\nunified framework that utilizes a shared feature transformation incorporating\nthree probabilistic measurements for both strategies. By considering dual-view\ncorrespondences in one stage, DualBEV effectively bridges the gap between these\nstrategies, harnessing their individual strengths. Our method achieves\nstate-of-the-art performance without Transformer, delivering comparable\nefficiency to the LSS approach, with 55.2% mAP and 63.4% NDS on the nuScenes\ntest set. Code is available at \\url{https://github.com/PeidongLi/DualBEV}\n","authors":["Peidong Li","Wancheng Shen","Qihao Huang","Dixiao Cui"],"pdf_url":"https://arxiv.org/pdf/2403.05402v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.08582v1","updated":"2024-09-13T07:00:44Z","published":"2024-09-13T07:00:44Z","title":"ChangeChat: An Interactive Model for Remote Sensing Change Analysis via\n Multimodal Instruction Tuning","summary":" Remote sensing (RS) change analysis is vital for monitoring Earth's dynamic\nprocesses by detecting alterations in images over time. Traditional change\ndetection excels at identifying pixel-level changes but lacks the ability to\ncontextualize these alterations. While recent advancements in change captioning\noffer natural language descriptions of changes, they do not support\ninteractive, user-specific queries. To address these limitations, we introduce\nChangeChat, the first bitemporal vision-language model (VLM) designed\nspecifically for RS change analysis. ChangeChat utilizes multimodal instruction\ntuning, allowing it to handle complex queries such as change captioning,\ncategory-specific quantification, and change localization. To enhance the\nmodel's performance, we developed the ChangeChat-87k dataset, which was\ngenerated using a combination of rule-based methods and GPT-assisted\ntechniques. Experiments show that ChangeChat offers a comprehensive,\ninteractive solution for RS change analysis, achieving performance comparable\nto or even better than state-of-the-art (SOTA) methods on specific tasks, and\nsignificantly surpassing the latest general-domain model, GPT-4. Code and\npre-trained weights are available at https://github.com/hanlinwu/ChangeChat.\n","authors":["Pei Deng","Wenqian Zhou","Hanlin Wu"],"pdf_url":"https://arxiv.org/pdf/2409.08582v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.08573v1","updated":"2024-09-13T06:46:23Z","published":"2024-09-13T06:46:23Z","title":"HTR-VT: Handwritten Text Recognition with Vision Transformer","summary":" We explore the application of Vision Transformer (ViT) for handwritten text\nrecognition. The limited availability of labeled data in this domain poses\nchallenges for achieving high performance solely relying on ViT. Previous\ntransformer-based models required external data or extensive pre-training on\nlarge datasets to excel. To address this limitation, we introduce a\ndata-efficient ViT method that uses only the encoder of the standard\ntransformer. 
We find that incorporating a Convolutional Neural Network (CNN)\nfor feature extraction instead of the original patch embedding and employ\nSharpness-Aware Minimization (SAM) optimizer to ensure that the model can\nconverge towards flatter minima and yield notable enhancements. Furthermore,\nour introduction of the span mask technique, which masks interconnected\nfeatures in the feature map, acts as an effective regularizer. Empirically, our\napproach competes favorably with traditional CNN-based models on small datasets\nlike IAM and READ2016. Additionally, it establishes a new benchmark on the LAM\ndataset, currently the largest dataset with 19,830 training text lines. The\ncode is publicly available at: https://github.com/YutingLi0606/HTR-VT.\n","authors":["Yuting Li","Dexiong Chen","Tinglong Tang","Xi Shen"],"pdf_url":"https://arxiv.org/pdf/2409.08573v1.pdf","comment":"Accepted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2409.08572v1","updated":"2024-09-13T06:45:23Z","published":"2024-09-13T06:45:23Z","title":"DiffFAS: Face Anti-Spoofing via Generative Diffusion Models","summary":" Face anti-spoofing (FAS) plays a vital role in preventing face recognition\n(FR) systems from presentation attacks. Nowadays, FAS systems face the\nchallenge of domain shift, impacting the generalization performance of existing\nFAS methods. In this paper, we rethink about the inherence of domain shift and\ndeconstruct it into two factors: image style and image quality. Quality\ninfluences the purity of the presentation of spoof information, while style\naffects the manner in which spoof information is presented. Based on our\nanalysis, we propose DiffFAS framework, which quantifies quality as prior\ninformation input into the network to counter image quality shift, and performs\ndiffusion-based high-fidelity cross-domain and cross-attack types generation to\ncounter image style shift. DiffFAS transforms easily collectible live faces\ninto high-fidelity attack faces with precise labels while maintaining\nconsistency between live and spoof face identities, which can also alleviate\nthe scarcity of labeled data with novel type attacks faced by nowadays FAS\nsystem. We demonstrate the effectiveness of our framework on challenging\ncross-domain and cross-attack FAS datasets, achieving the state-of-the-art\nperformance. Available at https://github.com/murphytju/DiffFAS.\n","authors":["Xinxu Ge","Xin Liu","Zitong Yu","Jingang Shi","Chun Qi","Jie Li","Heikki Kälviäinen"],"pdf_url":"https://arxiv.org/pdf/2409.08572v1.pdf","comment":"ECCV 24"},{"id":"http://arxiv.org/abs/2409.08566v1","updated":"2024-09-13T06:36:31Z","published":"2024-09-13T06:36:31Z","title":"Hybrid-TTA: Continual Test-time Adaptation via Dynamic Domain Shift\n Detection","summary":" Continual Test Time Adaptation (CTTA) has emerged as a critical approach for\nbridging the domain gap between the controlled training environments and the\nreal-world scenarios, enhancing model adaptability and robustness. Existing\nCTTA methods, typically categorized into Full-Tuning (FT) and Efficient-Tuning\n(ET), struggle with effectively addressing domain shifts. To overcome these\nchallenges, we propose Hybrid-TTA, a holistic approach that dynamically selects\ninstance-wise tuning method for optimal adaptation. 
Our approach introduces the\nDynamic Domain Shift Detection (DDSD) strategy, which identifies domain shifts\nby leveraging temporal correlations in input sequences and dynamically switches\nbetween FT and ET to adapt to varying domain shifts effectively. Additionally,\nthe Masked Image Modeling based Adaptation (MIMA) framework is integrated to\nensure domain-agnostic robustness with minimal computational overhead. Our\nHybrid-TTA achieves a notable 1.6%p improvement in mIoU on the\nCityscapes-to-ACDC benchmark dataset, surpassing previous state-of-the-art\nmethods and offering a robust solution for real-world continual adaptation\nchallenges.\n","authors":["Hyewon Park","Hyejin Park","Jueun Ko","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.08566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08563v1","updated":"2024-09-13T06:33:41Z","published":"2024-09-13T06:33:41Z","title":"Second-order difference subspace","summary":" Subspace representation is a fundamental technique in various fields of\nmachine learning. Analyzing a geometrical relationship among multiple subspaces\nis essential for understanding subspace series' temporal and/or spatial\ndynamics. This paper proposes the second-order difference subspace, a\nhigher-order extension of the first-order difference subspace between two\nsubspaces that can analyze the geometrical difference between them. As a\npreliminary for that, we extend the definition of the first-order difference\nsubspace to the more general setting that two subspaces with different\ndimensions have an intersection. We then define the second-order difference\nsubspace by combining the concept of first-order difference subspace and\nprincipal component subspace (Karcher mean) between two subspaces, motivated by\nthe second-order central difference method. We can understand that the\nfirst/second-order difference subspaces correspond to the velocity and\nacceleration of subspace dynamics from the viewpoint of a geodesic on a\nGrassmann manifold. We demonstrate the validity and naturalness of our\nsecond-order difference subspace by showing numerical results on two\napplications: temporal shape analysis of a 3D object and time series analysis\nof a biometric signal.\n","authors":["Kazuhiro Fukui","Pedro H. V. Valois","Lincon Souza","Takumi Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2409.08563v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2409.08562v1","updated":"2024-09-13T06:29:45Z","published":"2024-09-13T06:29:45Z","title":"CSS: Overcoming Pose and Scene Challenges in Crowd-Sourced 3D Gaussian\n Splatting","summary":" We introduce Crowd-Sourced Splatting (CSS), a novel 3D Gaussian Splatting\n(3DGS) pipeline designed to overcome the challenges of pose-free scene\nreconstruction using crowd-sourced imagery. The dream of reconstructing\nhistorically significant but inaccessible scenes from collections of\nphotographs has long captivated researchers. However, traditional 3D techniques\nstruggle with missing camera poses, limited viewpoints, and inconsistent\nlighting. CSS addresses these challenges through robust geometric priors and\nadvanced illumination modeling, enabling high-quality novel view synthesis\nunder complex, real-world conditions. 
Our method demonstrates clear\nimprovements over existing approaches, paving the way for more accurate and\nflexible applications in AR, VR, and large-scale 3D reconstruction.\n","authors":["Runze Chen","Mingyu Xiao","Haiyong Luo","Fang Zhao","Fan Wu","Hao Xiong","Qi Liu","Meng Song"],"pdf_url":"https://arxiv.org/pdf/2409.08562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08557v1","updated":"2024-09-13T06:20:21Z","published":"2024-09-13T06:20:21Z","title":"DICS: Find Domain-Invariant and Class-Specific Features for\n Out-of-Distribution Generalization","summary":" While deep neural networks have made remarkable progress in various vision\ntasks, their performance typically deteriorates when tested in\nout-of-distribution (OOD) scenarios. Many OOD methods focus on extracting\ndomain-invariant features but neglect whether these features are unique to each\nclass. Even if some features are domain-invariant, they cannot serve as key\nclassification criteria if shared across different classes. In OOD tasks, both\ndomain-related and class-shared features act as confounders that hinder\ngeneralization. In this paper, we propose a DICS model to extract\nDomain-Invariant and Class-Specific features, including Domain Invariance\nTesting (DIT) and Class Specificity Testing (CST), which mitigate the effects\nof spurious correlations introduced by confounders. DIT learns domain-related\nfeatures of each source domain and removes them from inputs to isolate\ndomain-invariant class-related features. DIT ensures domain invariance by\naligning same-class features across different domains. Then, CST calculates\nsoft labels for those features by comparing them with features learned in\nprevious steps. We optimize the cross-entropy between the soft labels and their\ntrue labels, which enhances same-class similarity and different-class\ndistinctiveness, thereby reinforcing class specificity. Extensive experiments\non widely-used benchmarks demonstrate the effectiveness of our proposed\nalgorithm. Additional visualizations further demonstrate that DICS effectively\nidentifies the key features of each class in target domains.\n","authors":["Qiaowei Miao","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.08557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11447v2","updated":"2024-09-13T06:09:25Z","published":"2024-08-21T09:06:30Z","title":"GaussianOcc: Fully Self-supervised and Efficient 3D Occupancy Estimation\n with Gaussian Splatting","summary":" We introduce GaussianOcc, a systematic method that investigates the two\nusages of Gaussian splatting for fully self-supervised and efficient 3D\noccupancy estimation in surround views. First, traditional methods for\nself-supervised 3D occupancy estimation still require ground truth 6D poses\nfrom sensors during training. To address this limitation, we propose Gaussian\nSplatting for Projection (GSP) module to provide accurate scale information for\nfully self-supervised training from adjacent view projection. Additionally,\nexisting methods rely on volume rendering for final 3D voxel representation\nlearning using 2D signals (depth maps, semantic maps), which is both\ntime-consuming and less effective. We propose Gaussian Splatting from Voxel\nspace (GSV) to leverage the fast rendering properties of Gaussian splatting. 
As\na result, the proposed GaussianOcc method enables fully self-supervised (no\nground truth pose) 3D occupancy estimation in competitive performance with low\ncomputational cost (2.7 times faster in training and 5 times faster in\nrendering). The relevant code will be available in\nhttps://github.com/GANWANSHUI/GaussianOcc.git.\n","authors":["Wanshui Gan","Fang Liu","Hongbin Xu","Ningkai Mo","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2408.11447v2.pdf","comment":"Project page: https://ganwanshui.github.io/GaussianOcc/"},{"id":"http://arxiv.org/abs/2403.12839v2","updated":"2024-09-13T05:44:20Z","published":"2024-03-19T15:45:54Z","title":"Global-guided Focal Neural Radiance Field for Large-scale Scene\n Rendering","summary":" Neural radiance fields~(NeRF) have recently been applied to render\nlarge-scale scenes. However, their limited model capacity typically results in\nblurred rendering results. Existing large-scale NeRFs primarily address this\nlimitation by partitioning the scene into blocks, which are subsequently\nhandled by separate sub-NeRFs. These sub-NeRFs, trained from scratch and\nprocessed independently, lead to inconsistencies in geometry and appearance\nacross the scene. Consequently, the rendering quality fails to exhibit\nsignificant improvement despite the expansion of model capacity. In this work,\nwe present global-guided focal neural radiance field (GF-NeRF) that achieves\nhigh-fidelity rendering of large-scale scenes. Our proposed GF-NeRF utilizes a\ntwo-stage (Global and Focal) architecture and a global-guided training\nstrategy. The global stage obtains a continuous representation of the entire\nscene while the focal stage decomposes the scene into multiple blocks and\nfurther processes them with distinct sub-encoders. Leveraging this two-stage\narchitecture, sub-encoders only need fine-tuning based on the global encoder,\nthus reducing training complexity in the focal stage while maintaining\nscene-wide consistency. Spatial information and error information from the\nglobal stage also benefit the sub-encoders to focus on crucial areas and\neffectively capture more details of large-scale scenes. Notably, our approach\ndoes not rely on any prior knowledge about the target scene, attributing\nGF-NeRF adaptable to various large-scale scene types, including street-view and\naerial-view scenes. We demonstrate that our method achieves high-fidelity,\nnatural rendering results on various types of large-scale datasets. Our project\npage: https://shaomq2187.github.io/GF-NeRF/\n","authors":["Mingqi Shao","Feng Xiong","Hang Zhang","Shuang Yang","Mu Xu","Wei Bian","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.12839v2.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2407.12632v2","updated":"2024-09-13T05:43:14Z","published":"2024-07-17T15:00:35Z","title":"CerberusDet: Unified Multi-Dataset Object Detection","summary":" Conventional object detection models are usually limited by the data on which\nthey were trained and by the category logic they define. With the recent rise\nof Language-Visual Models, new methods have emerged that are not restricted to\nthese fixed categories. Despite their flexibility, such Open Vocabulary\ndetection models still fall short in accuracy compared to traditional models\nwith fixed classes. At the same time, more accurate data-specific models face\nchallenges when there is a need to extend classes or merge different datasets\nfor training. 
The latter often cannot be combined due to different logics or\nconflicting class definitions, making it difficult to improve a model without\ncompromising its performance. In this paper, we introduce CerberusDet, a\nframework with a multi-headed model designed for handling multiple object\ndetection tasks. Proposed model is built on the YOLO architecture and\nefficiently shares visual features from both backbone and neck components,\nwhile maintaining separate task heads. This approach allows CerberusDet to\nperform very efficiently while still delivering optimal results. We evaluated\nthe model on the PASCAL VOC dataset and Objects365 dataset to demonstrate its\nabilities. CerberusDet achieved state-of-the-art results with 36% less\ninference time. The more tasks are trained together, the more efficient the\nproposed model becomes compared to running individual models sequentially. The\ntraining and inference code, as well as the model, are available as open-source\n(https://github.com/ai-forever/CerberusDet).\n","authors":["Irina Tolstykh","Mikhail Chernyshov","Maksim Kuprashevich"],"pdf_url":"https://arxiv.org/pdf/2407.12632v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.07825v2","updated":"2024-09-13T05:33:47Z","published":"2024-09-12T08:15:39Z","title":"A Comprehensive Survey on Deep Multimodal Learning with Missing Modality","summary":" During multimodal model training and reasoning, data samples may miss certain\nmodalities and lead to compromised model performance due to sensor limitations,\ncost constraints, privacy concerns, data loss, and temporal and spatial\nfactors. This survey provides an overview of recent progress in Multimodal\nLearning with Missing Modality (MLMM), focusing on deep learning techniques. It\nis the first comprehensive survey that covers the historical background and the\ndistinction between MLMM and standard multimodal learning setups, followed by a\ndetailed analysis of current MLMM methods, applications, and datasets,\nconcluding with a discussion about challenges and potential future directions\nin the field.\n","authors":["Renjie Wu","Hu Wang","Hsiang-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2409.07825v2.pdf","comment":"Work in progress; open to discussion; planning to submit to ACM CSUR\n in September"},{"id":"http://arxiv.org/abs/2403.08557v4","updated":"2024-09-13T05:14:36Z","published":"2024-03-13T14:08:45Z","title":"OC4-ReID: Occluded Cloth-Changing Person Re-Identification","summary":" The study of Cloth-Changing Person Re-identification (CC-ReID) focuses on\nretrieving specific pedestrians when their clothing has changed, typically\nunder the assumption that the entire pedestrian images are visible. Pedestrian\nimages in real-world scenarios, however, are often partially obscured by\nobstacles, presenting a significant challenge to existing CC-ReID systems. In\nthis paper, we introduce a more challenging task termed Occluded Cloth-Changing\nPerson Re-Identification (OC4-ReID), which simultaneously addresses two\nchallenges of clothing changes and occlusion. Concretely, we construct two new\ndatasets, Occ-LTCC and Occ-PRCC, based on original CC-ReID datasets to include\nrandom occlusions of key pedestrians components (e.g., head, torso). 
Moreover,\na novel benchmark is proposed for OC4-ReID incorporating a Train-Test Micro\nGranularity Screening (T2MGS) module to mitigate the influence of occlusion and\nproposing a Part-Robust Triplet (PRT) loss for partial features learning.\nComprehensive experiments on the proposed datasets, as well as on two CC-ReID\nbenchmark datasets demonstrate the superior performance of proposed method\nagainst other state-of-the-art methods. The codes and datasets are available\nat: https://github.com/1024AILab/OC4-ReID.\n","authors":["Zhihao Chen","Yiyuan Ge","Ziyang Wang","Jiaju Kang","Mingya Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08557v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08537v1","updated":"2024-09-13T04:54:34Z","published":"2024-09-13T04:54:34Z","title":"SRE-CNN: A Spatiotemporal Rotation-Equivariant CNN for Cardiac Cine MR\n Imaging","summary":" Dynamic MR images possess various transformation symmetries,including the\nrotation symmetry of local features within the image and along the temporal\ndimension. Utilizing these symmetries as prior knowledge can facilitate dynamic\nMR imaging with high spatiotemporal resolution. Equivariant CNN is an effective\ntool to leverage the symmetry priors. However, current equivariant CNN methods\nfail to fully exploit these symmetry priors in dynamic MR imaging. In this\nwork, we propose a novel framework of Spatiotemporal Rotation-Equivariant CNN\n(SRE-CNN), spanning from the underlying high-precision filter design to the\nconstruction of the temporal-equivariant convolutional module and imaging\nmodel, to fully harness the rotation symmetries inherent in dynamic MR images.\nThe temporal-equivariant convolutional module enables exploitation the rotation\nsymmetries in both spatial and temporal dimensions, while the high-precision\nconvolutional filter, based on parametrization strategy, enhances the\nutilization of rotation symmetry of local features to improve the\nreconstruction of detailed anatomical structures. Experiments conducted on\nhighly undersampled dynamic cardiac cine data (up to 20X) have demonstrated the\nsuperior performance of our proposed approach, both quantitatively and\nqualitatively.\n","authors":["Yuliang Zhu","Jing Cheng","Zhuo-Xu Cui","Jianfeng Ren","Chengbo Wang","Dong Liang"],"pdf_url":"https://arxiv.org/pdf/2409.08537v1.pdf","comment":"Accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.15861v2","updated":"2024-09-13T04:11:30Z","published":"2024-07-10T13:50:31Z","title":"Adversarial Attacks and Defenses on Text-to-Image Diffusion Models: A\n Survey","summary":" Recently, the text-to-image diffusion model has gained considerable attention\nfrom the community due to its exceptional image generation capability. A\nrepresentative model, Stable Diffusion, amassed more than 10 million users\nwithin just two months of its release. This surge in popularity has facilitated\nstudies on the robustness and safety of the model, leading to the proposal of\nvarious adversarial attack methods. Simultaneously, there has been a marked\nincrease in research focused on defense methods to improve the robustness and\nsafety of these models. In this survey, we provide a comprehensive review of\nthe literature on adversarial attacks and defenses targeting text-to-image\ndiffusion models. We begin with an overview of text-to-image diffusion models,\nfollowed by an introduction to a taxonomy of adversarial attacks and an\nin-depth review of existing attack methods. 
We then present a detailed analysis\nof current defense methods that improve model robustness and safety. Finally,\nwe discuss ongoing challenges and explore promising future research directions.\nFor a complete list of the adversarial attack and defense methods covered in\nthis survey, please refer to our curated repository at\nhttps://github.com/datar001/Awesome-AD-on-T2IDM.\n","authors":["Chenyu Zhang","Mingwang Hu","Wenhui Li","Lanjun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.15861v2.pdf","comment":"Accepted for Information Fusion. Related benchmarks and codes are\n available at \\url{https://github.com/datar001/Awesome-AD-on-T2IDM}"},{"id":"http://arxiv.org/abs/2408.11559v2","updated":"2024-09-13T03:57:47Z","published":"2024-08-21T12:13:18Z","title":"Semi-supervised 3D Semantic Scene Completion with 2D Vision Foundation\n Model Guidance","summary":" Accurate prediction of 3D semantic occupancy from 2D visual images is vital\nin enabling autonomous agents to comprehend their surroundings for planning and\nnavigation. State-of-the-art methods typically employ fully supervised\napproaches, necessitating a huge labeled dataset acquired through expensive\nLiDAR sensors and meticulous voxel-wise labeling by human annotators. The\nresource-intensive nature of this annotating process significantly hampers the\napplication and scalability of these methods. We introduce a novel\nsemi-supervised framework to alleviate the dependency on densely annotated\ndata. Our approach leverages 2D foundation models to generate essential 3D\nscene geometric and semantic cues, facilitating a more efficient training\nprocess. Our framework exhibits notable properties: (1) Generalizability,\napplicable to various 3D semantic scene completion approaches, including 2D-3D\nlifting and 3D-2D transformer methods. (2) Effectiveness, as demonstrated\nthrough experiments on SemanticKITTI and NYUv2, wherein our method achieves up\nto 85% of the fully-supervised performance using only 10% labeled data. This\napproach not only reduces the cost and labor associated with data annotation\nbut also demonstrates the potential for broader adoption in camera-based\nsystems for 3D semantic occupancy prediction.\n","authors":["Duc-Hai Pham","Duc Dung Nguyen","Hoang-Anh Pham","Ho Lai Tuan","Phong Ha Nguyen","Khoi Nguyen","Rang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.11559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08520v1","updated":"2024-09-13T03:40:58Z","published":"2024-09-13T03:40:58Z","title":"GroundingBooth: Grounding Text-to-Image Customization","summary":" Recent studies in text-to-image customization show great success in\ngenerating personalized object variants given several images of a subject.\nWhile existing methods focus more on preserving the identity of the subject,\nthey often fall short of controlling the spatial relationship between objects.\nIn this work, we introduce GroundingBooth, a framework that achieves zero-shot\ninstance-level spatial grounding on both foreground subjects and background\nobjects in the text-to-image customization task. Our proposed text-image\ngrounding module and masked cross-attention layer allow us to generate\npersonalized images with both accurate layout alignment and identity\npreservation while maintaining text-image coherence. 
With such layout control,\nour model inherently enables the customization of multiple subjects at once.\nOur model is evaluated on both layout-guided image synthesis and\nreference-based customization tasks, showing strong results compared to\nexisting methods. Our work is the first work to achieve a joint grounding of\nboth subject-driven foreground generation and text-driven background\ngeneration.\n","authors":["Zhexiao Xiong","Wei Xiong","Jing Shi","He Zhang","Yizhi Song","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2409.08520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08518v1","updated":"2024-09-13T03:34:37Z","published":"2024-09-13T03:34:37Z","title":"Anytime Continual Learning for Open Vocabulary Classification","summary":" We propose an approach for anytime continual learning (AnytimeCL) for open\nvocabulary image classification. The AnytimeCL problem aims to break away from\nbatch training and rigid models by requiring that a system can predict any set\nof labels at any time and efficiently update and improve when receiving one or\nmore training samples at any time. Despite the challenging goal, we achieve\nsubstantial improvements over recent methods. We propose a dynamic weighting\nbetween predictions of a partially fine-tuned model and a fixed open vocabulary\nmodel that enables continual improvement when training samples are available\nfor a subset of a task's labels. We also propose an attention-weighted PCA\ncompression of training features that reduces storage and computation with\nlittle impact to model accuracy. Our methods are validated with experiments\nthat test flexibility of learning and inference. Code is available at\nhttps://github.com/jessemelpolio/AnytimeCL.\n","authors":["Zhen Zhu","Yiming Gong","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2409.08518v1.pdf","comment":"To appear at ECCV 2024 as Oral presentation"},{"id":"http://arxiv.org/abs/2409.08516v1","updated":"2024-09-13T03:29:16Z","published":"2024-09-13T03:29:16Z","title":"AWF: Adaptive Weight Fusion for Enhanced Class Incremental Semantic\n Segmentation","summary":" Class Incremental Semantic Segmentation (CISS) aims to mitigate catastrophic\nforgetting by maintaining a balance between previously learned and newly\nintroduced knowledge. Existing methods, primarily based on regularization\ntechniques like knowledge distillation, help preserve old knowledge but often\nface challenges in effectively integrating new knowledge, resulting in limited\noverall improvement. Endpoints Weight Fusion (EWF) method, while simple,\neffectively addresses some of these limitations by dynamically fusing the model\nweights from previous steps with those from the current step, using a fusion\nparameter alpha determined by the relative number of previously known classes\nand newly introduced classes. However, the simplicity of the alpha calculation\nmay limit its ability to fully capture the complexities of different task\nscenarios, potentially leading to suboptimal fusion outcomes. In this paper, we\npropose an enhanced approach called Adaptive Weight Fusion (AWF), which\nintroduces an alternating training strategy for the fusion parameter, allowing\nfor more flexible and adaptive weight integration. AWF achieves superior\nperformance by better balancing the retention of old knowledge with the\nlearning of new classes, significantly improving results on benchmark CISS\ntasks compared to the original EWF. 
And our experiment code will be released on\nGithub.\n","authors":["Zechao Sun","Haolin Jin","Weitong Chen","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.08516v1.pdf","comment":"10 pages,6 figures"},{"id":"http://arxiv.org/abs/2409.08513v1","updated":"2024-09-13T03:23:52Z","published":"2024-09-13T03:23:52Z","title":"Mamba-YOLO-World: Marrying YOLO-World with Mamba for Open-Vocabulary\n Detection","summary":" Open-vocabulary detection (OVD) aims to detect objects beyond a predefined\nset of categories. As a pioneering model incorporating the YOLO series into\nOVD, YOLO-World is well-suited for scenarios prioritizing speed and\nefficiency.However, its performance is hindered by its neck feature fusion\nmechanism, which causes the quadratic complexity and the limited guided\nreceptive fields.To address these limitations, we present Mamba-YOLO-World, a\nnovel YOLO-based OVD model employing the proposed MambaFusion Path Aggregation\nNetwork (MambaFusion-PAN) as its neck architecture. Specifically, we introduce\nan innovative State Space Model-based feature fusion mechanism consisting of a\nParallel-Guided Selective Scan algorithm and a Serial-Guided Selective Scan\nalgorithm with linear complexity and globally guided receptive fields. It\nleverages multi-modal input sequences and mamba hidden states to guide the\nselective scanning process.Experiments demonstrate that our model outperforms\nthe original YOLO-World on the COCO and LVIS benchmarks in both zero-shot and\nfine-tuning settings while maintaining comparable parameters and FLOPs.\nAdditionally, it surpasses existing state-of-the-art OVD methods with fewer\nparameters and FLOPs.\n","authors":["Haoxuan Wang","Qingdong He","Jinlong Peng","Hao Yang","Mingmin Chi","Yabiao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08513v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.08510v1","updated":"2024-09-13T03:20:38Z","published":"2024-09-13T03:20:38Z","title":"CasDyF-Net: Image Dehazing via Cascaded Dynamic Filters","summary":" Image dehazing aims to restore image clarity and visual quality by reducing\natmospheric scattering and absorption effects. While deep learning has made\nsignificant strides in this area, more and more methods are constrained by\nnetwork depth. Consequently, lots of approaches have adopted parallel branching\nstrategies. however, they often prioritize aspects such as resolution,\nreceptive field, or frequency domain segmentation without dynamically\npartitioning branches based on the distribution of input features. Inspired by\ndynamic filtering, we propose using cascaded dynamic filters to create a\nmulti-branch network by dynamically generating filter kernels based on feature\nmap distribution. To better handle branch features, we propose a residual\nmultiscale block (RMB), combining different receptive fields. Furthermore, we\nalso introduce a dynamic convolution-based local fusion method to merge\nfeatures from adjacent branches. Experiments on RESIDE, Haze4K, and O-Haze\ndatasets validate our method's effectiveness, with our model achieving a PSNR\nof 43.21dB on the RESIDE-Indoor dataset. 
The code is available at\nhttps://github.com/dauing/CasDyF-Net.\n","authors":["Wang Yinglong","He Bin"],"pdf_url":"https://arxiv.org/pdf/2409.08510v1.pdf","comment":"9 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.08509v1","updated":"2024-09-13T03:12:58Z","published":"2024-09-13T03:12:58Z","title":"Exploiting Supervised Poison Vulnerability to Strengthen Self-Supervised\n Defense","summary":" Availability poisons exploit supervised learning (SL) algorithms by\nintroducing class-related shortcut features in images such that models trained\non poisoned data are useless for real-world datasets. Self-supervised learning\n(SSL), which utilizes augmentations to learn instance discrimination, is\nregarded as a strong defense against poisoned data. However, by extending the\nstudy of SSL across multiple poisons on the CIFAR-10 and ImageNet-100 datasets,\nwe demonstrate that it often performs poorly, far below that of training on\nclean data. Leveraging the vulnerability of SL to poison attacks, we introduce\nadversarial training (AT) on SL to obfuscate poison features and guide robust\nfeature learning for SSL. Our proposed defense, designated VESPR (Vulnerability\nExploitation of Supervised Poisoning for Robust SSL), surpasses the performance\nof six previous defenses across seven popular availability poisons. VESPR\ndisplays superior performance over all previous defenses, boosting the minimum\nand average ImageNet-100 test accuracies of poisoned models by 16% and 9%,\nrespectively. Through analysis and ablation studies, we elucidate the\nmechanisms by which VESPR learns robust class features.\n","authors":["Jeremy Styborski","Mingzhi Lyu","Yi Huang","Adams Kong"],"pdf_url":"https://arxiv.org/pdf/2409.08509v1.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08508v1","updated":"2024-09-13T03:12:10Z","published":"2024-09-13T03:12:10Z","title":"Identifying Human Indoor Daily Life Behavior employing Thermal Sensor\n Arrays (TSAs)","summary":" Daily activity monitoring systems used in households provide vital\ninformation for health status, particularly with aging residents. Multiple\napproaches have been introduced to achieve such goals, typically obtrusive and\nnon-obtrusive. Amongst the obtrusive approaches are the wearable devices, and\namong the non-obtrusive approaches are the movement detection systems,\nincluding motion sensors and thermal sensor arrays (TSAs). TSA systems are\nadvantageous when preserving a person's privacy and picking his precise spatial\nlocation. In this study, human daily living activities were monitored day and\nnight, constructing the corresponding activity time series and spatial\nprobability distribution and employing a TSA system. The monitored activities\nare classified into two categories: sleeping and daily activity. Results showed\nthe possibility of distinguishing between classes regardless of day and night.\nThe obtained sleep activity duration was compared with previous research using\nthe same raw data. Results showed that the duration of sleep activity, on\naverage, was 9 hours/day, and daily life activity was 7 hours/day. The person's\nspatial probability distribution was determined using the bivariate\ndistribution for the monitored location. In conclusion, the results showed that\nsleeping activity was dominant. Our study showed that TSAs were the optimum\nchoice when monitoring human activity. 
Our proposed approach tackled\nlimitations encountered by previous human activity monitoring systems, such as\npreserving human privacy while knowing his precise spatial location.\n","authors":["Dina E. Abdelaleem","Hassan M. Ahmed","M. Sami Soliman","Tarek M. Said"],"pdf_url":"https://arxiv.org/pdf/2409.08508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03487v2","updated":"2024-09-13T03:03:16Z","published":"2024-09-05T12:52:24Z","title":"ScreenMark: Watermarking Arbitrary Visual Content on Screen","summary":" Digital watermarking has demonstrated its effectiveness in protecting\nmultimedia content. However, existing watermarking are predominantly tailored\nfor specific media types, rendering them less effective for the protection of\ncontent displayed on computer screens, which is often multimodal and dynamic.\nVisual Screen Content (VSC), is particularly susceptible to theft and leakage\nvia screenshots, a vulnerability that current watermarking methods fail to\nadequately address. To tackle these challenges, we propose ScreenMark, a robust\nand practical watermarking method designed specifically for arbitrary VSC\nprotection. ScreenMark utilizes a three-stage progressive watermarking\nframework. Initially, inspired by diffusion principles, we initialize the\nmutual transformation between regular watermark information and irregular\nwatermark patterns. Subsequently, these patterns are integrated with screen\ncontent using a pre-multiplication alpha blending technique, supported by a\npre-trained screen decoder for accurate watermark retrieval. The progressively\ncomplex distorter enhances the robustness of the watermark in real-world\nscreenshot scenarios. Finally, the model undergoes fine-tuning guided by a\njoint-level distorter to ensure optimal performance. To validate the\neffectiveness of ScreenMark, we compiled a dataset comprising 100,000\nscreenshots from various devices and resolutions. Extensive experiments across\ndifferent datasets confirm the method's superior robustness, imperceptibility,\nand practical applicability.\n","authors":["Xiujian Liang","Gaozhi Liu","Yichao Si","Xiaoxiao Hu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08501v1","updated":"2024-09-13T02:52:25Z","published":"2024-09-13T02:52:25Z","title":"PSTNet: Enhanced Polyp Segmentation with Multi-scale Alignment and\n Frequency Domain Integration","summary":" Accurate segmentation of colorectal polyps in colonoscopy images is crucial\nfor effective diagnosis and management of colorectal cancer (CRC). However,\ncurrent deep learning-based methods primarily rely on fusing RGB information\nacross multiple scales, leading to limitations in accurately identifying polyps\ndue to restricted RGB domain information and challenges in feature misalignment\nduring multi-scale aggregation. To address these limitations, we propose the\nPolyp Segmentation Network with Shunted Transformer (PSTNet), a novel approach\nthat integrates both RGB and frequency domain cues present in the images.\nPSTNet comprises three key modules: the Frequency Characterization Attention\nModule (FCAM) for extracting frequency cues and capturing polyp\ncharacteristics, the Feature Supplementary Alignment Module (FSAM) for aligning\nsemantic information and reducing misalignment noise, and the Cross Perception\nlocalization Module (CPM) for synergizing frequency cues with high-level\nsemantics to achieve efficient polyp segmentation. 
Extensive experiments on\nchallenging datasets demonstrate PSTNet's significant improvement in polyp\nsegmentation accuracy across various metrics, consistently outperforming\nstate-of-the-art methods. The integration of frequency domain cues and the\nnovel architectural design of PSTNet contribute to advancing computer-assisted\npolyp segmentation, facilitating more accurate diagnosis and management of CRC.\n","authors":["Wenhao Xu","Rongtao Xu","Changwei Wang","Xiuli Li","Shibiao Xu","Li Guo"],"pdf_url":"https://arxiv.org/pdf/2409.08501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08500v1","updated":"2024-09-13T02:48:56Z","published":"2024-09-13T02:48:56Z","title":"Cross-conditioned Diffusion Model for Medical Image to Image Translation","summary":" Multi-modal magnetic resonance imaging (MRI) provides rich, complementary\ninformation for analyzing diseases. However, the practical challenges of\nacquiring multiple MRI modalities, such as cost, scan time, and safety\nconsiderations, often result in incomplete datasets. This affects both the\nquality of diagnosis and the performance of deep learning models trained on\nsuch data. Recent advancements in generative adversarial networks (GANs) and\ndenoising diffusion models have shown promise in natural and medical\nimage-to-image translation tasks. However, the complexity of training GANs and\nthe computational expense associated with diffusion models hinder their\ndevelopment and application in this task. To address these issues, we introduce\na Cross-conditioned Diffusion Model (CDM) for medical image-to-image\ntranslation. The core idea of CDM is to use the distribution of target\nmodalities as guidance to improve synthesis quality while achieving higher\ngeneration efficiency compared to conventional diffusion models. First, we\npropose a Modality-specific Representation Model (MRM) to model the\ndistribution of target modalities. Then, we design a Modality-decoupled\nDiffusion Network (MDN) to efficiently and effectively learn the distribution\nfrom MRM. Finally, a Cross-conditioned UNet (C-UNet) with a Condition Embedding\nmodule is designed to synthesize the target modalities with the source\nmodalities as input and the target distribution for guidance. Extensive\nexperiments conducted on the BraTS2023 and UPenn-GBM benchmark datasets\ndemonstrate the superiority of our method.\n","authors":["Zhaohu Xing","Sicheng Yang","Sixiang Chen","Tian Ye","Yijun Yang","Jing Qin","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.08500v1.pdf","comment":"miccai24"},{"id":"http://arxiv.org/abs/2405.01607v3","updated":"2024-09-13T02:42:45Z","published":"2024-05-02T04:53:42Z","title":"Wildfire Risk Prediction: A Review","summary":" Wildfires have significant impacts on global vegetation, wildlife, and\nhumans. They destroy plant communities and wildlife habitats and contribute to\nincreased emissions of carbon dioxide, nitrogen oxides, methane, and other\npollutants. The prediction of wildfires relies on various independent variables\ncombined with regression or machine learning methods. In this technical review,\nwe describe the options for independent variables, data processing techniques,\nmodels, independent variables collinearity and importance estimation methods,\nand model performance evaluation metrics. First, we divide the independent\nvariables into 4 aspects, including climate and meteorology conditions,\nsocio-economical factors, terrain and hydrological features, and wildfire\nhistorical records. 
Second, preprocessing methods are described for different\nmagnitudes, different spatial-temporal resolutions, and different formats of\ndata. Third, the collinearity and importance evaluation methods of independent\nvariables are also considered. Fourth, we discuss the application of\nstatistical models, traditional machine learning models, and deep learning\nmodels in wildfire risk prediction. In this subsection, compared with other\nreviews, this manuscript particularly discusses the evaluation metrics and\nrecent advancements in deep learning methods. Lastly, addressing the\nlimitations of current research, this paper emphasizes the need for more\neffective deep learning time series forecasting algorithms, the utilization of\nthree-dimensional data including ground and trunk fuel, extraction of more\naccurate historical fire point data, and improved model evaluation metrics.\n","authors":["Zhengsen Xu","Jonathan Li","Sibo Cheng","Xue Rui","Yu Zhao","Hongjie He","Linlin Xu"],"pdf_url":"https://arxiv.org/pdf/2405.01607v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08494v1","updated":"2024-09-13T02:41:49Z","published":"2024-09-13T02:41:49Z","title":"WheelPoser: Sparse-IMU Based Body Pose Estimation for Wheelchair Users","summary":" Despite researchers having extensively studied various ways to track body\npose on-the-go, most prior work does not take into account wheelchair users,\nleading to poor tracking performance. Wheelchair users could greatly benefit\nfrom this pose information to prevent injuries, monitor their health, identify\nenvironmental accessibility barriers, and interact with gaming and VR\nexperiences. In this work, we present WheelPoser, a real-time pose estimation\nsystem specifically designed for wheelchair users. Our system uses only four\nstrategically placed IMUs on the user's body and wheelchair, making it far more\npractical than prior systems using cameras and dense IMU arrays. WheelPoser is\nable to track a wheelchair user's pose with a mean joint angle error of 14.30\ndegrees and a mean joint position error of 6.74 cm, more than three times\nbetter than similar systems using sparse IMUs. To train our system, we collect\na novel WheelPoser-IMU dataset, consisting of 167 minutes of paired IMU sensor\nand motion capture data of people in wheelchairs, including wheelchair-specific\nmotions such as propulsion and pressure relief. Finally, we explore the\npotential application space enabled by our system and discuss future\nopportunities. Open-source code, models, and dataset can be found here:\nhttps://github.com/axle-lab/WheelPoser.\n","authors":["Yunzhi Li","Vimal Mollyn","Kuang Yuan","Patrick Carrington"],"pdf_url":"https://arxiv.org/pdf/2409.08494v1.pdf","comment":"Accepted by ASSETS 2024"},{"id":"http://arxiv.org/abs/2408.05008v2","updated":"2024-09-13T02:41:09Z","published":"2024-08-09T11:40:20Z","title":"FlowDreamer: exploring high fidelity text-to-3D generation via rectified\n flow","summary":" Recent advances in text-to-3D generation have made significant progress. In\nparticular, with the pretrained diffusion models, existing methods\npredominantly use Score Distillation Sampling (SDS) to train 3D models such as\nNeural Radiance Fields (NeRF) and 3D Gaussian Splatting (3D GS). However, a\nhurdle is that they often encounter difficulties with over-smoothing textures\nand over-saturating colors. 
The rectified flow model - which utilizes a simple\nordinary differential equation (ODE) to represent a linear trajectory - shows\npromise as an alternative prior to text-to-3D generation. It learns a\ntime-independent vector field, thereby reducing the ambiguity in 3D model\nupdate gradients that are calculated using time-dependent scores in the SDS\nframework. In light of this, we first develop a mathematical analysis to\nseamlessly integrate SDS with rectified flow model, paving the way for our\ninitial framework known as Vector Field Distillation Sampling (VFDS). However,\nempirical findings indicate that VFDS still results in over-smoothing outcomes.\nTherefore, we analyze the grounding reasons for such a failure from the\nperspective of ODE trajectories. On top, we propose a novel framework, named\nFlowDreamer, which yields high-fidelity results with richer textual details and\nfaster convergence. The key insight is to leverage the coupling and reversible\nproperties of the rectified flow model to search for the corresponding noise,\nrather than using randomly sampled noise as in VFDS. Accordingly, we introduce\na novel Unique Couple Matching (UCM) loss, which guides the 3D model to\noptimize along the same trajectory. Our FlowDreamer is superior in its\nflexibility to be applied to both NeRF and 3D GS. Extensive experiments\ndemonstrate the high-fidelity outcomes and accelerated convergence of\nFlowDreamer.\n","authors":["Hangyu Li","Xiangxiang Chu","Dingyuan Shi","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05008v2.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2409.08492v1","updated":"2024-09-13T02:37:13Z","published":"2024-09-13T02:37:13Z","title":"Tri-Plane Mamba: Efficiently Adapting Segment Anything Model for 3D\n Medical Images","summary":" General networks for 3D medical image segmentation have recently undergone\nextensive exploration. Behind the exceptional performance of these networks\nlies a significant demand for a large volume of pixel-level annotated data,\nwhich is time-consuming and labor-intensive. The emergence of the Segment\nAnything Model (SAM) has enabled this model to achieve superior performance in\n2D medical image segmentation tasks via parameter- and data-efficient feature\nadaptation. However, the introduction of additional depth channels in 3D\nmedical images not only prevents the sharing of 2D pre-trained features but\nalso results in a quadratic increase in the computational cost for adapting\nSAM. To overcome these challenges, we present the Tri-Plane Mamba (TP-Mamba)\nadapters tailored for the SAM, featuring two major innovations: 1) multi-scale\n3D convolutional adapters, optimized for efficiently processing local\ndepth-level information, 2) a tri-plane mamba module, engineered to capture\nlong-range depth-level representation without significantly increasing\ncomputational costs. This approach achieves state-of-the-art performance in 3D\nCT organ segmentation tasks. Remarkably, this superior performance is\nmaintained even with scarce training data. 
Specifically using only three CT\ntraining samples from the BTCV dataset, it surpasses conventional 3D\nsegmentation networks, attaining a Dice score that is up to 12% higher.\n","authors":["Hualiang Wang","Yiqun Lin","Xinpeng Ding","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2409.08492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10060v3","updated":"2024-09-13T02:33:22Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. It includes 1,000 images with human\nlabels and 50,000 images with automatically generated weak labels. This dataset\ncould serve as a foundation for the research community to develop advanced\nwrinkle detection algorithms. Second, we introduce a simple training strategy\nutilizing texture maps, applicable to various segmentation models, to detect\nwrinkles across the face. Our two-stage training strategy first pretrain models\non a large dataset with weak labels (N=50k), or masked texture maps generated\nthrough computer vision techniques, without human intervention. We then\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. The network takes as input a combination of RGB and\nmasked texture map of the image, comprising four channels, in finetuning. We\neffectively combine labels from multiple annotators to minimize subjectivity in\nmanual labeling. Our strategies demonstrate improved segmentation performance\nin facial wrinkle segmentation both quantitatively and visually compared to\nexisting pretraining methods. The dataset is available at\nhttps://github.com/labhai/ffhq-wrinkle-dataset.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08482v1","updated":"2024-09-13T02:13:26Z","published":"2024-09-13T02:13:26Z","title":"Risks When Sharing LoRA Fine-Tuned Diffusion Model Weights","summary":" With the emerging trend in generative models and convenient public access to\ndiffusion models pre-trained on large datasets, users can fine-tune these\nmodels to generate images of personal faces or items in new contexts described\nby natural language. Parameter efficient fine-tuning (PEFT) such as Low Rank\nAdaptation (LoRA) has become the most common way to save memory and computation\nusage on the user end during fine-tuning. However, a natural question is\nwhether the private images used for fine-tuning will be leaked to adversaries\nwhen sharing model weights. In this paper, we study the issue of privacy\nleakage of a fine-tuned diffusion model in a practical setting, where\nadversaries only have access to model weights, rather than prompts or images\nused for fine-tuning. We design and build a variational network autoencoder\nthat takes model weights as input and outputs the reconstruction of private\nimages. To improve the efficiency of training such an autoencoder, we propose a\ntraining paradigm with the help of timestep embedding. 
The results give a\nsurprising answer to this research question: an adversary can generate images\ncontaining the same identities as the private images. Furthermore, we\ndemonstrate that no existing defense method, including differential\nprivacy-based methods, can preserve the privacy of private data used for\nfine-tuning a diffusion model without compromising the utility of a fine-tuned\nmodel.\n","authors":["Dixi Yao"],"pdf_url":"https://arxiv.org/pdf/2409.08482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08481v1","updated":"2024-09-13T02:13:11Z","published":"2024-09-13T02:13:11Z","title":"USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in\n 2020s","summary":" Image/video coding has been a remarkable research area for both academia and\nindustry for many years. Testing datasets, especially high-quality image/video\ndatasets, are desirable for the justified evaluation of coding-related research,\npractical applications, and standardization activities. We put forward a test\ndataset, namely USTC-TD, which has been successfully adopted in the practical\nend-to-end image/video coding challenge of the IEEE International Conference on\nVisual Communications and Image Processing in 2022 and 2023. USTC-TD contains\n40 images at 4K spatial resolution and 10 video sequences at 1080p spatial\nresolution, featuring various content due to the diverse environmental factors\n(scene type, texture, motion, view) and the designed imaging factors\n(illumination, shadow, lens). We quantitatively evaluate USTC-TD on different\nimage/video features (spatial, temporal, color, lightness), and compare it with\nthe previous image/video test datasets, which verifies the wider coverage and\ngreater diversity of the proposed dataset. We also evaluate both classic\nstandardized and recent learned image/video coding schemes on USTC-TD with PSNR\nand MS-SSIM, and provide an extensive benchmark for the evaluated schemes.\nBased on the characteristics and specific design of the proposed test dataset,\nwe analyze the benchmark performance and shed light on the future research and\ndevelopment of image/video coding. All the data are released online:\nhttps://esakak.github.io/USTC-TD.\n","authors":["Zhuoyuan Li","Junqi Liao","Chuanbo Tang","Haotian Zhang","Yuqi Li","Yifan Bian","Xihua Sheng","Xinmin Feng","Yao Li","Changsheng Gao","Li Li","Dong Liu","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.08481v1.pdf","comment":"24 pages. Project Page: https://esakak.github.io/USTC-TD"},{"id":"http://arxiv.org/abs/2409.08475v1","updated":"2024-09-13T02:02:07Z","published":"2024-09-13T02:02:07Z","title":"RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense\n Positive Supervision","summary":" RT-DETR is the first real-time end-to-end transformer-based object detector.\nIts efficiency comes from the framework design and the Hungarian matching.\nHowever, compared to dense supervision detectors like the YOLO series, the\nHungarian matching provides much sparser supervision, leading to insufficient\nmodel training and making it difficult to achieve optimal results. To address these\nissues, we propose a hierarchical dense positive supervision method based on\nRT-DETR, named RT-DETRv3. Firstly, we introduce a CNN-based auxiliary branch\nthat provides dense supervision and collaborates with the original decoder to\nenhance the encoder feature representation. Secondly, to address insufficient\ndecoder training, we propose a novel learning strategy involving self-attention\nperturbation. 
This strategy diversifies label assignment for positive samples\nacross multiple query groups, thereby enriching the positive supervision.\nAdditionally, we introduce a shared-weight decoder branch for dense positive\nsupervision to ensure that more high-quality queries match each ground truth.\nNotably, all aforementioned modules are training-only. We conduct extensive\nexperiments to demonstrate the effectiveness of our approach on COCO val2017.\nRT-DETRv3 significantly outperforms existing real-time detectors, including the\nRT-DETR series and the YOLO series. For example, RT-DETRv3-R18 achieves 48.1%\nAP (+1.6%/+1.4%) compared to RT-DETR-R18/RT-DETRv2-R18 while maintaining the\nsame latency. Meanwhile, it requires only half the training epochs to attain comparable\nperformance. Furthermore, RT-DETRv3-R101 can attain an impressive 54.6% AP,\noutperforming YOLOv10-X. Code will be released soon.\n","authors":["Shuo Wang","Chunlong Xia","Feng Lv","Yifeng Shi"],"pdf_url":"https://arxiv.org/pdf/2409.08475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08474v1","updated":"2024-09-13T02:00:16Z","published":"2024-09-13T02:00:16Z","title":"Rethinking Meta-Learning from a Learning Lens","summary":" Meta-learning has emerged as a powerful approach for leveraging knowledge\nfrom previous tasks to solve new tasks. The mainstream methods focus on\ntraining a well-generalized model initialization, which is then adapted to\ndifferent tasks with limited data and updates. However, this pushes the model\ntoward overfitting on the training tasks. Previous methods mainly attributed this to\nthe lack of data and used augmentations to address this issue, but they were\nlimited by the need for sufficient training and effective augmentation strategies. In this\nwork, we focus on the more fundamental ``learning to learn'' strategy of\nmeta-learning to explore what causes errors and how to eliminate these errors\nwithout changing the environment. Specifically, we first rethink the\nalgorithmic procedure of meta-learning from a ``learning'' lens. Through\ntheoretical and empirical analyses, we find that (i) this paradigm faces the\nrisk of both overfitting and underfitting and (ii) the models adapted to\ndifferent tasks promote each other, with a stronger effect if the tasks\nare more similar. Based on this insight, we propose using task relations to\ncalibrate the optimization process of meta-learning and propose a plug-and-play\nmethod called Task Relation Learner (TRLearner) to achieve this goal.\nSpecifically, it first obtains task relation matrices from the extracted\ntask-specific meta-data. Then, it uses the obtained matrices with\nrelation-aware consistency regularization to guide optimization. Extensive\ntheoretical and empirical analyses demonstrate the effectiveness of TRLearner.\n","authors":["Jingyao Wang","Wenwen Qiang","Jiangmeng Li","Lingyu Si","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.08474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08468v1","updated":"2024-09-13T01:49:12Z","published":"2024-09-13T01:49:12Z","title":"Generalization Boosted Adapter for Open-Vocabulary Segmentation","summary":" Vision-language models (VLMs) have demonstrated remarkable open-vocabulary\nobject recognition capabilities, motivating their adaptation for dense\nprediction tasks like segmentation. However, directly applying VLMs to such\ntasks remains challenging due to their lack of pixel-level granularity and the\nlimited data available for fine-tuning, leading to overfitting and poor\ngeneralization. 
To address these limitations, we propose Generalization Boosted\nAdapter (GBA), a novel adapter strategy that enhances the generalization and\nrobustness of VLMs for open-vocabulary segmentation. GBA comprises two core\ncomponents: (1) a Style Diversification Adapter (SDA) that decouples features\ninto amplitude and phase components, operating solely on the amplitude to\nenrich the feature space representation while preserving semantic consistency;\nand (2) a Correlation Constraint Adapter (CCA) that employs cross-attention to\nestablish tighter semantic associations between text categories and target\nregions, suppressing irrelevant low-frequency ``noise'' information and\navoiding erroneous associations. Through the synergistic effect of the shallow\nSDA and the deep CCA, GBA effectively alleviates overfitting issues and\nenhances the semantic relevance of feature representations. As a simple,\nefficient, and plug-and-play component, GBA can be flexibly integrated into\nvarious CLIP-based methods, demonstrating broad applicability and achieving\nstate-of-the-art performance on multiple open-vocabulary segmentation\nbenchmarks.\n","authors":["Wenhao Xu","Changwei Wang","Xuxiang Feng","Rongtao Xu","Longzhao Huang","Zherui Zhang","Li Guo","Shibiao Xu"],"pdf_url":"https://arxiv.org/pdf/2409.08468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08464v1","updated":"2024-09-13T01:30:24Z","published":"2024-09-13T01:30:24Z","title":"VLTP: Vision-Language Guided Token Pruning for Task-Oriented\n Segmentation","summary":" Vision Transformers (ViTs) have emerged as the backbone of many segmentation\nmodels, consistently achieving state-of-the-art (SOTA) performance. However,\ntheir success comes at a significant computational cost. Image token pruning is\none of the most effective strategies to address this complexity. However,\nprevious approaches fall short when applied to more complex task-oriented\nsegmentation (TOS), where the class of each image patch is not predefined but\ndependent on the specific input task. This work introduces Vision Language\nGuided Token Pruning (VLTP), a novel token pruning mechanism that can\naccelerate ViT-based segmentation models, particularly for TOS guided by a\nmulti-modal large language model (MLLM). We argue that a ViT does not need to\nprocess every image token through all of its layers; only the tokens related to\nreasoning tasks are necessary. We design a new pruning decoder that takes both\nimage tokens and vision-language guidance as input and predicts the relevance of\neach image token to the task. Only image tokens with high relevance are passed\nto deeper layers of the ViT. Experiments show that the VLTP framework reduces\nthe computational costs of ViT by approximately 25% without performance\ndegradation and by around 40% with only a 1% performance drop.\n","authors":["Hanning Chen","Yang Ni","Wenjun Huang","Yezi Liu","SungHeon Jeong","Fei Wen","Nathaniel Bastian","Hugo Latapie","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2409.08464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08461v1","updated":"2024-09-13T01:19:53Z","published":"2024-09-13T01:19:53Z","title":"VistaFormer: Scalable Vision Transformers for Satellite Image Time\n Series Segmentation","summary":" We introduce VistaFormer, a lightweight Transformer-based model architecture\nfor the semantic segmentation of remote-sensing images. 
This model uses a\nmulti-scale Transformer-based encoder with a lightweight decoder that\naggregates global and local attention captured in the encoder blocks.\nVistaFormer uses position-free self-attention layers which simplifies the model\narchitecture and removes the need to interpolate temporal and spatial codes,\nwhich can reduce model performance when training and testing image resolutions\ndiffer. We investigate simple techniques for filtering noisy input signals like\nclouds and demonstrate that improved model scalability can be achieved by\nsubstituting Multi-Head Self-Attention (MHSA) with Neighbourhood Attention\n(NA). Experiments on the PASTIS and MTLCC crop-type segmentation benchmarks\nshow that VistaFormer achieves better performance than comparable models and\nrequires only 8% of the floating point operations using MHSA and 11% using NA\nwhile also using fewer trainable parameters. VistaFormer with MHSA improves on\nstate-of-the-art mIoU scores by 0.1% on the PASTIS benchmark and 3% on the\nMTLCC benchmark while VistaFormer with NA improves on the MTLCC benchmark by\n3.7%.\n","authors":["Ezra MacDonald","Derek Jacoby","Yvonne Coady"],"pdf_url":"https://arxiv.org/pdf/2409.08461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11256v3","updated":"2024-09-13T00:35:30Z","published":"2024-04-17T11:06:42Z","title":"MMCBE: Multi-modality Dataset for Crop Biomass Prediction and Beyond","summary":" Crop biomass, a critical indicator of plant growth, health, and productivity,\nis invaluable for crop breeding programs and agronomic research. However, the\naccurate and scalable quantification of crop biomass remains inaccessible due\nto limitations in existing measurement methods. One of the obstacles impeding\nthe advancement of current crop biomass prediction methodologies is the\nscarcity of publicly available datasets. Addressing this gap, we introduce a\nnew dataset in this domain, i.e. Multi-modality dataset for crop biomass\nestimation (MMCBE). Comprising 216 sets of multi-view drone images, coupled\nwith LiDAR point clouds, and hand-labelled ground truth, MMCBE represents the\nfirst multi-modality one in the field. This dataset aims to establish benchmark\nmethods for crop biomass quantification and foster the development of\nvision-based approaches. We have rigorously evaluated state-of-the-art crop\nbiomass estimation methods using MMCBE and ventured into additional potential\napplications, such as 3D crop reconstruction from drone imagery and novel-view\nrendering. With this publication, we are making our comprehensive dataset\navailable to the broader community.\n","authors":["Xuesong Li","Zeeshan Hayder","Ali Zia","Connor Cassidy","Shiming Liu","Warwick Stiller","Eric Stone","Warren Conaty","Lars Petersson","Vivien Rolland"],"pdf_url":"https://arxiv.org/pdf/2404.11256v3.pdf","comment":"10 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.08444v1","updated":"2024-09-13T00:26:09Z","published":"2024-09-13T00:26:09Z","title":"Towards Unified Facial Action Unit Recognition Framework by Large\n Language Models","summary":" Facial Action Units (AUs) are of great significance in the realm of affective\ncomputing. In this paper, we propose AU-LLaVA, the first unified AU recognition\nframework based on the Large Language Model (LLM). AU-LLaVA consists of a\nvisual encoder, a linear projector layer, and a pre-trained LLM. 
We\nmeticulously craft the text descriptions and fine-tune the model on various AU\ndatasets, allowing it to generate different formats of AU recognition results\nfor the same input image. On the BP4D and DISFA datasets, AU-LLaVA delivers the\nmost accurate recognition results for nearly half of the AUs. Our model\nachieves improvements in F1-score of up to 11.4% for specific AUs\ncompared to previous benchmark results. On the FEAFA dataset, our method\nachieves significant improvements across all 24 AUs compared to previous\nbenchmark results. AU-LLaVA demonstrates exceptional performance and\nversatility in AU recognition.\n","authors":["Guohong Hu","Xing Lan","Hanyu Jiang","Jiayi Lyu","Jian Xue"],"pdf_url":"https://arxiv.org/pdf/2409.08444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08443v1","updated":"2024-09-13T00:20:10Z","published":"2024-09-13T00:20:10Z","title":"CF-PRNet: Coarse-to-Fine Prototype Refining Network for Point Cloud\n Completion and Reconstruction","summary":" In modern agriculture, precise monitoring of plants and fruits is crucial for\ntasks such as high-throughput phenotyping and automated harvesting. This paper\naddresses the challenge of reconstructing accurate 3D shapes of fruits from\npartial views, which is common in agricultural settings. We introduce CF-PRNet,\na coarse-to-fine prototype refining network that leverages high-resolution 3D data\nduring the training phase but requires only a single RGB-D image for real-time\ninference. Our approach begins by processing the incomplete point cloud data,\nconstructed from a partial view of a fruit, with a series of convolutional\nblocks. The extracted features inform the generation of scaling vectors that\nrefine two sequentially constructed 3D mesh prototypes - one coarse and one\nfine-grained. This progressive refinement facilitates the detailed completion\nof the final point clouds, achieving detailed and accurate reconstructions.\nCF-PRNet demonstrates excellent performance metrics with a Chamfer Distance of\n3.78, an F1 Score of 66.76%, a Precision of 56.56%, and a Recall of 85.31%, and\nwon first place in the Shape Completion and Reconstruction of Sweet Peppers\nChallenge.\n","authors":["Zhi Chen","Tianqi Wei","Zecheng Zhao","Jia Syuen Lim","Yadan Luo","Hu Zhang","Xin Yu","Scott Chapman","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.08443v1.pdf","comment":"Technical Report of the 1st place solution to CVPPA@ECCV2024: Shape\n Completion and Reconstruction of Sweet Peppers Challenge"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.09010v1","updated":"2024-09-13T17:38:47Z","published":"2024-09-13T17:38:47Z","title":"Contri(e)ve: Context + Retrieve for Scholarly Question Answering","summary":" Scholarly communication is a rapidly growing field containing a wealth of\nknowledge. However, due to its unstructured, document-based format, it is\nchallenging to extract useful information from these documents through conventional\ndocument retrieval methods. Scholarly knowledge graphs solve this problem by\nrepresenting the documents in a semantic network, providing hidden insights,\nsummaries, and ease of accessibility through queries. Naturally, question\nanswering for scholarly graphs expands the accessibility to a wider audience.\nBut some of the knowledge in this domain is still presented as unstructured\ntext, thus requiring a hybrid solution for question answering systems. 
In this\npaper, we present a two step solution using open source Large Language\nModel(LLM): Llama3.1 for Scholarly-QALD dataset. Firstly, we extract the\ncontext pertaining to the question from different structured and unstructured\ndata sources: DBLP, SemOpenAlex knowledge graphs and Wikipedia text. Secondly,\nwe implement prompt engineering to improve the information retrieval\nperformance of the LLM. Our approach achieved an F1 score of 40% and also\nobserved some anomalous responses from the LLM, that are discussed in the final\npart of the paper.\n","authors":["Kanchan Shivashankar","Nadine Steinmetz"],"pdf_url":"https://arxiv.org/pdf/2409.09010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08987v1","updated":"2024-09-13T17:03:56Z","published":"2024-09-13T17:03:56Z","title":"Comparative Analysis of Pretrained Audio Representations in Music\n Recommender Systems","summary":" Over the years, Music Information Retrieval (MIR) has proposed various models\npretrained on large amounts of music data. Transfer learning showcases the\nproven effectiveness of pretrained backend models with a broad spectrum of\ndownstream tasks, including auto-tagging and genre classification. However, MIR\npapers generally do not explore the efficiency of pretrained models for Music\nRecommender Systems (MRS). In addition, the Recommender Systems community tends\nto favour traditional end-to-end neural network learning over these models. Our\nresearch addresses this gap and evaluates the applicability of six pretrained\nbackend models (MusicFM, Music2Vec, MERT, EncodecMAE, Jukebox, and MusiCNN) in\nthe context of MRS. We assess their performance using three recommendation\nmodels: K-nearest neighbours (KNN), shallow neural network, and BERT4Rec. Our\nfindings suggest that pretrained audio representations exhibit significant\nperformance variability between traditional MIR tasks and MRS, indicating that\nvaluable aspects of musical information captured by backend models may differ\ndepending on the task. This study establishes a foundation for further\nexploration of pretrained audio representations to enhance music recommendation\nsystems.\n","authors":["Yan-Martin Tamm","Anna Aljanaki"],"pdf_url":"https://arxiv.org/pdf/2409.08987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08975v1","updated":"2024-09-13T16:48:39Z","published":"2024-09-13T16:48:39Z","title":"Accurate and Fast Estimation of Temporal Motifs using Path Sampling","summary":" Counting the number of small subgraphs, called motifs, is a fundamental\nproblem in social network analysis and graph mining. Many real-world networks\nare directed and temporal, where edges have timestamps. Motif counting in\ndirected, temporal graphs is especially challenging because there are a\nplethora of different kinds of patterns. Temporal motif counts reveal much\nricher information and there is a need for scalable algorithms for motif\ncounting.\n A major challenge in counting is that there can be trillions of temporal\nmotif matches even with a graph with only millions of vertices. Both the motifs\nand the input graphs can have multiple edges between two vertices, leading to a\ncombinatorial explosion problem. Counting temporal motifs involving just four\nvertices is not feasible with current state-of-the-art algorithms.\n We design an algorithm, TEACUPS, that addresses this problem using a novel\ntechnique of temporal path sampling. 
We combine a path sampling method with\ncarefully designed temporal data structures, to propose an efficient\napproximate algorithm for temporal motif counting. TEACUPS is an unbiased\nestimator with provable concentration behavior, which can be used to bound the\nestimation error. For a Bitcoin graph with hundreds of millions of edges,\nTEACUPS runs in less than 1 minute, while the exact counting algorithm takes\nmore than a day. We empirically demonstrate the accuracy of TEACUPS on large\ndatasets, showing an average of 30$\\times$ speedup (up to 2000$\\times$ speedup)\ncompared to existing GPU-based exact counting methods while preserving high\ncount estimation accuracy.\n","authors":["Yunjie Pan","Omkar Bhalerao","C. Seshadhri","Nishil Talati"],"pdf_url":"https://arxiv.org/pdf/2409.08975v1.pdf","comment":"Accepted for ICDM'24"},{"id":"http://arxiv.org/abs/2409.08934v1","updated":"2024-09-13T15:53:40Z","published":"2024-09-13T15:53:40Z","title":"Proactive Recommendation in Social Networks: Steering User Interest via\n Neighbor Influence","summary":" Recommending items solely catering to users' historical interests narrows\nusers' horizons. Recent works have considered steering target users beyond\ntheir historical interests by directly adjusting items exposed to them.\nHowever, the recommended items for direct steering might not align perfectly\nwith users' interests evolution, detrimentally affecting target users'\nexperience. To avoid this issue, we propose a new task named Proactive\nRecommendation in Social Networks (PRSN) that indirectly steers users' interest\nby utilizing the influence of social neighbors, i.e., indirect steering by\nadjusting the exposure of a target item to target users' neighbors. The key to\nPRSN lies in answering an interventional question: what would a target user's\nfeedback be on a target item if the item is exposed to the user's different\nneighbors? To answer this question, we resort to causal inference and formalize\nPRSN as: (1) estimating the potential feedback of a user on an item, under the\nnetwork interference by the item's exposure to the user's neighbors; and (2)\nadjusting the exposure of a target item to target users' neighbors to trade off\nsteering performance and the damage to the neighbors' experience. To this end,\nwe propose a Neighbor Interference Recommendation (NIRec) framework with two\nkey modules: (1)an interference representation-based estimation module for\nmodeling potential feedback; and (2) a post-learning-based optimization module\nfor optimizing a target item's exposure to trade off steering performance and\nthe neighbors' experience by greedy search. We conduct extensive\nsemi-simulation experiments based on three real-world datasets, validating the\nsteering effectiveness of NIRec.\n","authors":["Hang Pan","Shuxian Bi","Wenjie Wang","Haoxuan Li","Peng Wu","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2409.08934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08931v1","updated":"2024-09-13T15:47:50Z","published":"2024-09-13T15:47:50Z","title":"LLM-based Weak Supervision Framework for Query Intent Classification in\n Video Search","summary":" Streaming services have reshaped how we discover and engage with digital\nentertainment. Despite these advancements, effectively understanding the wide\nspectrum of user search queries continues to pose a significant challenge. 
An\naccurate query understanding system that can handle a variety of entities that\nrepresent different user intents is essential for delivering an enhanced user\nexperience. We can build such a system by training a natural language\nunderstanding (NLU) model; however, obtaining high-quality labeled training\ndata in this specialized domain is a substantial obstacle. Manual annotation is\ncostly and impractical for capturing users' vast vocabulary variations. To\naddress this, we introduce a novel approach that leverages large language\nmodels (LLMs) through weak supervision to automatically annotate a vast\ncollection of user search queries. Using prompt engineering and a diverse set\nof LLM personas, we generate training data that matches human annotator\nexpectations. By incorporating domain knowledge via Chain of Thought and\nIn-Context Learning, our approach leverages the labeled data to train\nlow-latency models optimized for real-time inference. Extensive evaluations\ndemonstrated that our approach outperformed the baseline with an average\nrelative gain of 113% in recall. Furthermore, our novel prompt engineering\nframework yields higher quality LLM-generated data to be used for weak\nsupervision; we observed 47.60% improvement over baseline in agreement rate\nbetween LLM predictions and human annotations with respect to F1 score,\nweighted according to the distribution of occurrences of the search queries.\nOur persona selection routing mechanism further adds an additional 3.67%\nincrease in weighted F1 score on top of our novel prompt engineering framework.\n","authors":["Farnoosh Javadi","Phanideep Gampa","Alyssa Woo","Xingxing Geng","Hang Zhang","Jose Sepulveda","Belhassen Bayar","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08931v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.05462v2","updated":"2024-09-13T13:36:40Z","published":"2024-09-09T09:42:46Z","title":"Federated Transfer Learning Based Cooperative Wideband Spectrum Sensing\n with Model Pruning","summary":" For ultra-wideband and high-rate wireless communication systems, wideband\nspectrum sensing (WSS) is critical, since it empowers secondary users (SUs) to\ncapture the spectrum holes for opportunistic transmission. However, WSS\nencounters challenges such as excessive costs of hardware and computation due\nto the high sampling rate, as well as robustness issues arising from scenario\nmismatch. In this paper, a WSS neural network (WSSNet) is proposed by\nexploiting multicoset preprocessing to enable the sub-Nyquist sampling, with\nthe two dimensional convolution design specifically tailored to work with the\npreprocessed samples. A federated transfer learning (FTL) based framework\nmobilizing multiple SUs is further developed to achieve a robust model\nadaptable to various scenarios, which is paved by the selective weight pruning\nfor the fast model adaptation and inference. Simulation results demonstrate\nthat the proposed FTL-WSSNet achieves the fairly good performance in different\ntarget scenarios even without local adaptation samples.\n","authors":["Jibin Jia","Peihao Dong","Fuhui Zhou","Qihui Wu"],"pdf_url":"https://arxiv.org/pdf/2409.05462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07272v2","updated":"2024-09-13T12:03:06Z","published":"2024-09-11T13:46:52Z","title":"RePlay: a Recommendation Framework for Experimentation and Production\n Use","summary":" Using a single tool to build and compare recommender systems significantly\nreduces the time to market for new models. 
In addition, the comparison results\nwhen using such tools look more consistent. This is why many different tools\nand libraries for researchers in the field of recommendations have recently\nappeared. Unfortunately, most of these frameworks are aimed primarily at\nresearchers and require modification for use in production due to the inability\nto work on large datasets or an inappropriate architecture. In this demo, we\npresent our open-source toolkit RePlay - a framework containing an end-to-end\npipeline for building recommender systems, which is ready for production use.\nRePlay also allows you to use a suitable stack for the pipeline on each stage:\nPandas, Polars, or Spark. This allows the library to scale computations and\ndeploy to a cluster. Thus, RePlay allows data scientists to easily move from\nresearch mode to production mode using the same interfaces.\n","authors":["Alexey Vasilev","Anna Volodkevich","Denis Kulandin","Tatiana Bysheva","Anton Klenitskiy"],"pdf_url":"https://arxiv.org/pdf/2409.07272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08703v1","updated":"2024-09-13T10:43:18Z","published":"2024-09-13T10:43:18Z","title":"NeSHFS: Neighborhood Search with Heuristic-based Feature Selection for\n Click-Through Rate Prediction","summary":" Click-through-rate (CTR) prediction plays an important role in online\nadvertising and ad recommender systems. In the past decade, maximizing CTR has\nbeen the main focus of model development and solution creation. Therefore,\nresearchers and practitioners have proposed various models and solutions to\nenhance the effectiveness of CTR prediction. Most of the existing literature\nfocuses on capturing either implicit or explicit feature interactions. Although\nimplicit interactions are successfully captured in some studies, explicit\ninteractions present a challenge for achieving high CTR by extracting both\nlow-order and high-order feature interactions. Unnecessary and irrelevant\nfeatures may cause high computational time and low prediction performance.\nFurthermore, certain features may perform well with specific predictive models\nwhile underperforming with others. Also, feature distribution may fluctuate due\nto traffic variations. Most importantly, in live production environments,\nresources are limited, and the time for inference is just as crucial as\ntraining time. Because of all these reasons, feature selection is one of the\nmost important factors in enhancing CTR prediction model performance. Simple\nfilter-based feature selection algorithms do not perform well and they are not\nsufficient. An effective and efficient feature selection algorithm is needed to\nconsistently filter the most useful features during live CTR prediction\nprocess. 
In this paper, we propose a heuristic algorithm named Neighborhood\nSearch with Heuristic-based Feature Selection (NeSHFS) to enhance CTR\nprediction performance while reducing dimensionality and training time costs.\nWe conduct comprehensive experiments on three public datasets to validate the\nefficiency and effectiveness of our proposed solution.\n","authors":["Dogukan Aksu","Ismail Hakki Toroslu","Hasan Davulcu"],"pdf_url":"https://arxiv.org/pdf/2409.08703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08543v1","updated":"2024-09-13T05:33:09Z","published":"2024-09-13T05:33:09Z","title":"ATFLRec: A Multimodal Recommender System with Audio-Text Fusion and\n Low-Rank Adaptation via Instruction-Tuned Large Language Model","summary":" Recommender Systems (RS) play a pivotal role in boosting user satisfaction by\nproviding personalized product suggestions in domains such as e-commerce and\nentertainment. This study examines the integration of multimodal data text and\naudio into large language models (LLMs) with the aim of enhancing\nrecommendation performance. Traditional text and audio recommenders encounter\nlimitations such as the cold-start problem, and recent advancements in LLMs,\nwhile promising, are computationally expensive. To address these issues,\nLow-Rank Adaptation (LoRA) is introduced, which enhances efficiency without\ncompromising performance. The ATFLRec framework is proposed to integrate audio\nand text modalities into a multimodal recommendation system, utilizing various\nLoRA configurations and modality fusion techniques. Results indicate that\nATFLRec outperforms baseline models, including traditional and graph neural\nnetwork-based approaches, achieving higher AUC scores. Furthermore, separate\nfine-tuning of audio and text data with distinct LoRA modules yields optimal\nperformance, with different pooling methods and Mel filter bank numbers\nsignificantly impacting performance. This research offers valuable insights\ninto optimizing multimodal recommender systems and advancing the integration of\ndiverse data modalities in LLMs.\n","authors":["Zezheng Qin"],"pdf_url":"https://arxiv.org/pdf/2409.08543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07276v2","updated":"2024-09-13T04:16:55Z","published":"2024-09-11T13:49:48Z","title":"STORE: Streamlining Semantic Tokenization and Generative Recommendation\n with A Single LLM","summary":" Traditional recommendation models often rely on unique item identifiers (IDs)\nto distinguish between items, which can hinder their ability to effectively\nleverage item content information and generalize to long-tail or cold-start\nitems. Recently, semantic tokenization has been proposed as a promising\nsolution that aims to tokenize each item's semantic representation into a\nsequence of discrete tokens. In this way, it preserves the item's semantics\nwithin these tokens and ensures that semantically similar items are represented\nby similar tokens. These semantic tokens have become fundamental in training\ngenerative recommendation models. However, existing generative recommendation\nmethods typically involve multiple sub-models for embedding, quantization, and\nrecommendation, leading to an overly complex system. In this paper, we propose\nto streamline the semantic tokenization and generative recommendation process\nwith a unified framework, dubbed STORE, which leverages a single large language\nmodel (LLM) for both tasks. 
Specifically, we formulate semantic tokenization as\na text-to-token task and generative recommendation as a token-to-token task,\nsupplemented by a token-to-text reconstruction task and a text-to-token\nauxiliary task. All these tasks are framed in a generative manner and trained\nusing a single LLM backbone. Extensive experiments have been conducted to\nvalidate the effectiveness of our STORE framework across various recommendation\ntasks and datasets. We will release the source code and configurations for\nreproducible research.\n","authors":["Qijiong Liu","Jieming Zhu","Lu Fan","Zhou Zhao","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2409.07276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08479v1","updated":"2024-09-13T02:08:47Z","published":"2024-09-13T02:08:47Z","title":"Exploring Information Retrieval Landscapes: An Investigation of a Novel\n Evaluation Techniques and Comparative Document Splitting Methods","summary":" The performance of Retrieval-Augmented Generation (RAG) systems in\ninformation retrieval is significantly influenced by the characteristics of the\ndocuments being processed. In this study, the structured nature of textbooks,\nthe conciseness of articles, and the narrative complexity of novels are shown\nto require distinct retrieval strategies. A comparative evaluation of multiple\ndocument-splitting methods reveals that the Recursive Character Splitter\noutperforms the Token-based Splitter in preserving contextual integrity. A\nnovel evaluation technique is introduced, utilizing an open-source model to\ngenerate a comprehensive dataset of question-and-answer pairs, simulating\nrealistic retrieval scenarios to enhance testing efficiency and metric\nreliability. The evaluation employs weighted scoring metrics, including\nSequenceMatcher, BLEU, METEOR, and BERT Score, to assess the system's accuracy\nand relevance. This approach establishes a refined standard for evaluating the\nprecision of RAG systems, with future research focusing on optimizing chunk and\noverlap sizes to improve retrieval accuracy and efficiency.\n","authors":["Esmaeil Narimissa","David Raithel"],"pdf_url":"https://arxiv.org/pdf/2409.08479v1.pdf","comment":"This article is 16 pages long and includes detailed comparisons of\n RAG systems and document splitting techniques"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.09032v1","updated":"2024-09-13T17:59:52Z","published":"2024-09-13T17:59:52Z","title":"The unknotting number, hard unknot diagrams, and reinforcement learning","summary":" We have developed a reinforcement learning agent that often finds a minimal\nsequence of unknotting crossing changes for a knot diagram with up to 200\ncrossings, hence giving an upper bound on the unknotting number. We have used\nthis to determine the unknotting number of 57k knots. We took diagrams of\nconnected sums of such knots with oppositely signed signatures, where the\nsummands were overlaid. The agent has found examples where several of the\ncrossing changes in an unknotting collection of crossings result in hyperbolic\nknots. Based on this, we have shown that, given knots $K$ and $K'$ that satisfy\nsome mild assumptions, there is a diagram of their connected sum and $u(K) +\nu(K')$ unknotting crossings such that changing any one of them results in a\nprime knot. As a by-product, we have obtained a dataset of 2.6 million distinct\nhard unknot diagrams; most of them under 35 crossings. 
Assuming the additivity\nof the unknotting number, we have determined the unknotting number of 43 at\nmost 12-crossing knots for which the unknotting number is unknown.\n","authors":["Taylor Applebaum","Sam Blackwell","Alex Davies","Thomas Edlich","András Juhász","Marc Lackenby","Nenad Tomašev","Daniel Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.09032v1.pdf","comment":"29 pages, 17 figures"},{"id":"http://arxiv.org/abs/2409.09021v1","updated":"2024-09-13T17:48:48Z","published":"2024-09-13T17:48:48Z","title":"INN-PAR: Invertible Neural Network for PPG to ABP Reconstruction","summary":" Non-invasive and continuous blood pressure (BP) monitoring is essential for\nthe early prevention of many cardiovascular diseases. Estimating arterial blood\npressure (ABP) from photoplethysmography (PPG) has emerged as a promising\nsolution. However, existing deep learning approaches for PPG-to-ABP\nreconstruction (PAR) encounter certain information loss, impacting the\nprecision of the reconstructed signal. To overcome this limitation, we\nintroduce an invertible neural network for PPG to ABP reconstruction (INN-PAR),\nwhich employs a series of invertible blocks to jointly learn the mapping\nbetween PPG and its gradient with the ABP signal and its gradient. INN-PAR\nefficiently captures both forward and inverse mappings simultaneously, thereby\npreventing information loss. By integrating signal gradients into the learning\nprocess, INN-PAR enhances the network's ability to capture essential\nhigh-frequency details, leading to more accurate signal reconstruction.\nMoreover, we propose a multi-scale convolution module (MSCM) within the\ninvertible block, enabling the model to learn features across multiple scales\neffectively. We have experimented on two benchmark datasets, which show that\nINN-PAR significantly outperforms the state-of-the-art methods in both waveform\nreconstruction and BP measurement accuracy.\n","authors":["Soumitra Kundu","Gargi Panda","Saumik Bhattacharya","Aurobinda Routray","Rajlakshmi Guha"],"pdf_url":"https://arxiv.org/pdf/2409.09021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09018v1","updated":"2024-09-13T17:45:53Z","published":"2024-09-13T17:45:53Z","title":"An Efficient and Streaming Audio Visual Active Speaker Detection System","summary":" This paper delves into the challenging task of Active Speaker Detection\n(ASD), where the system needs to determine in real-time whether a person is\nspeaking or not in a series of video frames. While previous works have made\nsignificant strides in improving network architectures and learning effective\nrepresentations for ASD, a critical gap exists in the exploration of real-time\nsystem deployment. Existing models often suffer from high latency and memory\nusage, rendering them impractical for immediate applications. To bridge this\ngap, we present two scenarios that address the key challenges posed by\nreal-time constraints. First, we introduce a method to limit the number of\nfuture context frames utilized by the ASD model. By doing so, we alleviate the\nneed for processing the entire sequence of future frames before a decision is\nmade, significantly reducing latency. Second, we propose a more stringent\nconstraint that limits the total number of past frames the model can access\nduring inference. This tackles the persistent memory issues associated with\nrunning streaming ASD systems. Beyond these theoretical frameworks, we conduct\nextensive experiments to validate our approach. 
Our results demonstrate that\nconstrained transformer models can achieve performance comparable to or even\nbetter than state-of-the-art recurrent models, such as uni-directional GRUs,\nwith a significantly reduced number of context frames. Moreover, we shed light\non the temporal memory requirements of ASD systems, revealing that larger past\ncontext has a more profound impact on accuracy than future context. When\nprofiling on a CPU we find that our efficient architecture is memory bound by\nthe amount of past context it can use and that the compute cost is negligible\nas compared to the memory cost.\n","authors":["Arnav Kundu","Yanzi Jin","Mohammad Sekhavat","Max Horton","Danny Tormoen","Devang Naik"],"pdf_url":"https://arxiv.org/pdf/2409.09018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09011v1","updated":"2024-09-13T17:40:01Z","published":"2024-09-13T17:40:01Z","title":"VAE Explainer: Supplement Learning Variational Autoencoders with\n Interactive Visualization","summary":" Variational Autoencoders are widespread in Machine Learning, but are\ntypically explained with dense math notation or static code examples. This\npaper presents VAE Explainer, an interactive Variational Autoencoder running in\nthe browser to supplement existing static documentation (e.g., Keras Code\nExamples). VAE Explainer adds interactions to the VAE summary with interactive\nmodel inputs, latent space, and output. VAE Explainer connects the high-level\nunderstanding with the implementation: annotated code and a live computational\ngraph. The VAE Explainer interactive visualization is live at\nhttps://xnought.github.io/vae-explainer and the code is open source at\nhttps://github.com/xnought/vae-explainer.\n","authors":["Donald Bertucci","Alex Endert"],"pdf_url":"https://arxiv.org/pdf/2409.09011v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.09007v1","updated":"2024-09-13T17:37:34Z","published":"2024-09-13T17:37:34Z","title":"SGFormer: Single-Layer Graph Transformers with Approximation-Free Linear\n Complexity","summary":" Learning representations on large graphs is a long-standing challenge due to\nthe inter-dependence nature. Transformers recently have shown promising\nperformance on small graphs thanks to its global attention for capturing\nall-pair interactions beyond observed structures. Existing approaches tend to\ninherit the spirit of Transformers in language and vision tasks, and embrace\ncomplicated architectures by stacking deep attention-based propagation layers.\nIn this paper, we attempt to evaluate the necessity of adopting multi-layer\nattentions in Transformers on graphs, which considerably restricts the\nefficiency. Specifically, we analyze a generic hybrid propagation layer,\ncomprised of all-pair attention and graph-based propagation, and show that\nmulti-layer propagation can be reduced to one-layer propagation, with the same\ncapability for representation learning. It suggests a new technical path for\nbuilding powerful and efficient Transformers on graphs, particularly through\nsimplifying model architectures without sacrificing expressiveness. As\nexemplified by this work, we propose a Simplified Single-layer Graph\nTransformers (SGFormer), whose main component is a single-layer global\nattention that scales linearly w.r.t. graph sizes and requires none of any\napproximation for accommodating all-pair interactions. 
Empirically, SGFormer\nsuccessfully scales to the web-scale graph ogbn-papers100M, yielding\norders-of-magnitude inference acceleration over peer Transformers on\nmedium-sized graphs, and demonstrates competitiveness with limited labeled\ndata.\n","authors":["Qitian Wu","Kai Yang","Hengrui Zhang","David Wipf","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2409.09007v1.pdf","comment":"Extended version of NeurIPS2023 contribution arXiv:2306.10759"},{"id":"http://arxiv.org/abs/2409.09003v1","updated":"2024-09-13T17:32:05Z","published":"2024-09-13T17:32:05Z","title":"Model-independent variable selection via the rule-based variable priorit","summary":" While achieving high prediction accuracy is a fundamental goal in machine\nlearning, an equally important task is finding a small number of features with\nhigh explanatory power. One popular selection technique is permutation\nimportance, which assesses a variable's impact by measuring the change in\nprediction error after permuting the variable. However, this can be problematic\ndue to the need to create artificial data, a problem shared by other methods as\nwell. Another problem is that variable selection methods can be limited by\nbeing model-specific. We introduce a new model-independent approach, Variable\nPriority (VarPro), which works by utilizing rules without the need to generate\nartificial data or evaluate prediction error. The method is relatively easy to\nuse, requiring only the calculation of sample averages of simple statistics,\nand can be applied to many data settings, including regression, classification,\nand survival. We investigate the asymptotic properties of VarPro and show,\namong other things, that VarPro has a consistent filtering property for noise\nvariables. Empirical studies using synthetic and real-world data show the\nmethod achieves a balanced performance and compares favorably to many\nstate-of-the-art procedures currently used for variable selection.\n","authors":["Min Lu","Hemant Ishwaran"],"pdf_url":"https://arxiv.org/pdf/2409.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08997v1","updated":"2024-09-13T17:23:42Z","published":"2024-09-13T17:23:42Z","title":"Biomimetic Frontend for Differentiable Audio Processing","summary":" While models in audio and speech processing are becoming deeper and more\nend-to-end, they as a consequence need expensive training on large data, and\nare often brittle. We build on a classical model of human hearing and make it\ndifferentiable, so that we can combine traditional explainable biomimetic\nsignal processing approaches with deep-learning frameworks. This allows us to\narrive at an expressive and explainable model that is easily trained on modest\namounts of data. We apply this model to audio processing tasks, including\nclassification and enhancement. Results show that our differentiable model\nsurpasses black-box approaches in terms of computational efficiency and\nrobustness, even with little training data. We also discuss other potential\napplications.\n","authors":["Ruolan Leslie Famularo","Dmitry N. Zotkin","Shihab A. 
Shamma","Ramani Duraiswami"],"pdf_url":"https://arxiv.org/pdf/2409.08997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00045v2","updated":"2024-09-13T16:59:00Z","published":"2024-03-25T04:45:28Z","title":"Policy Optimization finds Nash Equilibrium in Regularized General-Sum LQ\n Games","summary":" In this paper, we investigate the impact of introducing relative entropy\nregularization on the Nash Equilibria (NE) of General-Sum $N$-agent games,\nrevealing the fact that the NE of such games conform to linear Gaussian\npolicies. Moreover, it delineates sufficient conditions, contingent upon the\nadequacy of entropy regularization, for the uniqueness of the NE within the\ngame. As Policy Optimization serves as a foundational approach for\nReinforcement Learning (RL) techniques aimed at finding the NE, in this work we\nprove the linear convergence of a policy optimization algorithm which (subject\nto the adequacy of entropy regularization) is capable of provably attaining the\nNE. Furthermore, in scenarios where the entropy regularization proves\ninsufficient, we present a $\\delta$-augmentation technique, which facilitates\nthe achievement of an $\\epsilon$-NE within the game.\n","authors":["Muhammad Aneeq uz Zaman","Shubham Aggarwal","Melih Bastopcu","Tamer Başar"],"pdf_url":"https://arxiv.org/pdf/2404.00045v2.pdf","comment":"Accepted for Conference on Decision and Control 2024"},{"id":"http://arxiv.org/abs/2409.08985v1","updated":"2024-09-13T16:58:06Z","published":"2024-09-13T16:58:06Z","title":"Clean Label Attacks against SLU Systems","summary":" Poisoning backdoor attacks involve an adversary manipulating the training\ndata to induce certain behaviors in the victim model by inserting a trigger in\nthe signal at inference time. We adapted clean label backdoor (CLBD)-data\npoisoning attacks, which do not modify the training labels, on state-of-the-art\nspeech recognition models that support/perform a Spoken Language Understanding\ntask, achieving 99.8% attack success rate by poisoning 10% of the training\ndata. We analyzed how varying the signal-strength of the poison, percent of\nsamples poisoned, and choice of trigger impact the attack. We also found that\nCLBD attacks are most successful when applied to training samples that are\ninherently hard for a proxy model. Using this strategy, we achieved an attack\nsuccess rate of 99.3% by poisoning a meager 1.5% of the training data. Finally,\nwe applied two previously developed defenses against gradient-based attacks,\nand found that they attain mixed success against poisoning.\n","authors":["Henry Li Xinyuan","Sonal Joshi","Thomas Thebaud","Jesus Villalba","Najim Dehak","Sanjeev Khudanpur"],"pdf_url":"https://arxiv.org/pdf/2409.08985v1.pdf","comment":"Accepted at IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2310.03146v3","updated":"2024-09-13T16:55:40Z","published":"2023-10-04T20:18:45Z","title":"Fairness-enhancing mixed effects deep learning improves fairness on in-\n and out-of-distribution clustered (non-iid) data","summary":" Traditional deep learning (DL) models face two key challenges. First, they\nassume training samples are independent and identically distributed, an\nassumption often violated in real-world datasets where samples are grouped by\nshared measurements (e.g., participants or cells). This leads to performance\ndegradation, limited generalization, and confounding issues, causing Type 1 and\nType 2 errors. 
Second, DL models typically prioritize overall accuracy, often\noverlooking fairness across underrepresented groups, leading to biased outcomes\nin critical areas such as loan approvals and healthcare decisions. To address\nthese issues, we introduce the Fair Mixed Effects Deep Learning (Fair MEDL)\nframework. Fair MEDL quantifies cluster-invariant fixed effects (FE) and\ncluster-specific random effects (RE) through 1) a cluster adversary for\nlearning invariant FE, 2) a Bayesian neural network for RE, and 3) a mixing\nfunction combining FE and RE for final predictions. Additionally, we\nincorporate adversarial debiasing to promote fairness across three key metrics:\nEqualized Odds, Demographic Parity, and Counterfactual Fairness. Our method\nalso identifies and de-weights confounding probes, improving interpretability.\nEvaluated on three datasets from finance and healthcare, Fair MEDL improves\nfairness by up to 73% for age, 47% for race, 83% for sex, and 26% for marital\nstatus, while maintaining robust predictive performance. Our implementation is\npublicly available on GitHub.\n","authors":["Son Nguyen","Adam Wang","Albert Montillo"],"pdf_url":"https://arxiv.org/pdf/2310.03146v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08980v1","updated":"2024-09-13T16:52:24Z","published":"2024-09-13T16:52:24Z","title":"Predicting Trust In Autonomous Vehicles: Modeling Young Adult\n Psychosocial Traits, Risk-Benefit Attitudes, And Driving Factors With Machine\n Learning","summary":" Low trust remains a significant barrier to Autonomous Vehicle (AV) adoption.\nTo design trustworthy AVs, we need to better understand the individual traits,\nattitudes, and experiences that impact people's trust judgements. We use\nmachine learning to understand the most important factors that contribute to\nyoung adult trust based on a comprehensive set of personal factors gathered via\nsurvey (n = 1457). Factors ranged from psychosocial and cognitive attributes to\ndriving style, experiences, and perceived AV risks and benefits. Using the\nexplainable AI technique SHAP, we found that perceptions of AV risks and\nbenefits, attitudes toward feasibility and usability, institutional trust,\nprior experience, and a person's mental model are the most important\npredictors. Surprisingly, psychosocial and many technology- and\ndriving-specific factors were not strong predictors. Results highlight the\nimportance of individual differences for designing trustworthy AVs for diverse\ngroups and lead to key implications for future design and research.\n","authors":["Robert Kaufman","Emi Lee","Manas Satish Bedmutha","David Kirsh","Nadir Weibel"],"pdf_url":"https://arxiv.org/pdf/2409.08980v1.pdf","comment":"31 pages (including references and appendix), 7 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.08958v1","updated":"2024-09-13T16:23:17Z","published":"2024-09-13T16:23:17Z","title":"PINNfluence: Influence Functions for Physics-Informed Neural Networks","summary":" Recently, physics-informed neural networks (PINNs) have emerged as a flexible\nand promising application of deep learning to partial differential equations in\nthe physical sciences. While offering strong performance and competitive\ninference speeds on forward and inverse problems, their black-box nature limits\ninterpretability, particularly regarding alignment with expected physical\nbehavior. In the present work, we explore the application of influence\nfunctions (IFs) to validate and debug PINNs post-hoc. 
Specifically, we apply\nvariations of IF-based indicators to gauge the influence of different types of\ncollocation points on the prediction of PINNs applied to a 2D Navier-Stokes\nfluid flow problem. Our results demonstrate how IFs can be adapted to PINNs to\nreveal the potential for further studies.\n","authors":["Jonas R. Naujoks","Aleksander Krasowski","Moritz Weckbecker","Thomas Wiegand","Sebastian Lapuschkin","Wojciech Samek","René P. Klausen"],"pdf_url":"https://arxiv.org/pdf/2409.08958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10984v2","updated":"2024-09-13T16:21:58Z","published":"2024-03-16T17:32:59Z","title":"IoTCO2: Assessing the End-To-End Carbon Footprint of\n Internet-of-Things-Enabled Deep Learning","summary":" To improve privacy and ensure quality-of-service (QoS), deep learning (DL)\nmodels are increasingly deployed on Internet of Things (IoT) devices for data\nprocessing, significantly increasing the carbon footprint associated with DL on\nIoT, covering both operational and embodied aspects. Existing operational\nenergy predictors often overlook quantized DL models and emerging neural\nprocessing units (NPUs), while embodied carbon footprint modeling tools neglect\nnon-computing hardware components common in IoT devices, creating a gap in\naccurate carbon footprint modeling tools for IoT-enabled DL. This paper\nintroduces \\textit{\\carb}, an end-to-end tool for precise carbon footprint\nestimation in IoT-enabled DL, with deviations as low as 5\\% for operational and\n3.23\\% for embodied carbon footprints compared to actual measurements across\nvarious DL models. Additionally, practical applications of \\carb~are showcased\nthrough multiple user case studies.\n","authors":["Fan Chen","Shahzeen Attari","Gayle Buck","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.10984v2.pdf","comment":"5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2409.08954v1","updated":"2024-09-13T16:14:54Z","published":"2024-09-13T16:14:54Z","title":"A Bayesian Approach to Clustering via the Proper Bayesian Bootstrap: the\n Bayesian Bagged Clustering (BBC) algorithm","summary":" The paper presents a novel approach for unsupervised techniques in the field\nof clustering. A new method is proposed to enhance existing literature models\nusing the proper Bayesian bootstrap to improve results in terms of robustness\nand interpretability. Our approach is organized in two steps: k-means\nclustering is used for prior elicitation, then proper Bayesian bootstrap is\napplied as resampling method in an ensemble clustering approach. Results are\nanalyzed introducing measures of uncertainty based on Shannon entropy. The\nproposal provides clear indication on the optimal number of clusters, as well\nas a better representation of the clustered data. Empirical results are\nprovided on simulated data showing the methodological and empirical advances\nobtained.\n","authors":["Federico Maria Quetti","Silvia Figini","Elena ballante"],"pdf_url":"https://arxiv.org/pdf/2409.08954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08946v1","updated":"2024-09-13T16:06:18Z","published":"2024-09-13T16:06:18Z","title":"DELTA: Dual Consistency Delving with Topological Uncertainty for Active\n Graph Domain Adaptation","summary":" Graph domain adaptation has recently enabled knowledge transfer across\ndifferent graphs. However, without the semantic information on target graphs,\nthe performance on target graphs is still far from satisfactory. 
To address the\nissue, we study the problem of active graph domain adaptation, which selects a\nsmall quantitative of informative nodes on the target graph for extra\nannotation. This problem is highly challenging due to the complicated\ntopological relationships and the distribution discrepancy across graphs. In\nthis paper, we propose a novel approach named Dual Consistency Delving with\nTopological Uncertainty (DELTA) for active graph domain adaptation. Our DELTA\nconsists of an edge-oriented graph subnetwork and a path-oriented graph\nsubnetwork, which can explore topological semantics from complementary\nperspectives. In particular, our edge-oriented graph subnetwork utilizes the\nmessage passing mechanism to learn neighborhood information, while our\npath-oriented graph subnetwork explores high-order relationships from\nsubstructures. To jointly learn from two subnetworks, we roughly select\ninformative candidate nodes with the consideration of consistency across two\nsubnetworks. Then, we aggregate local semantics from its K-hop subgraph based\non node degrees for topological uncertainty estimation. To overcome potential\ndistribution shifts, we compare target nodes and their corresponding source\nnodes for discrepancy scores as an additional component for fine selection.\nExtensive experiments on benchmark datasets demonstrate that DELTA outperforms\nvarious state-of-the-art approaches.\n","authors":["Pengyun Wang","Yadi Cao","Chris Russell","Siyu Heng","Junyu Luo","Yanxin Shen","Xiao Luo"],"pdf_url":"https://arxiv.org/pdf/2409.08946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08938v1","updated":"2024-09-13T15:56:26Z","published":"2024-09-13T15:56:26Z","title":"Average-Reward Maximum Entropy Reinforcement Learning for Underactuated\n Double Pendulum Tasks","summary":" This report presents a solution for the swing-up and stabilisation tasks of\nthe acrobot and the pendubot, developed for the AI Olympics competition at IROS\n2024. Our approach employs the Average-Reward Entropy Advantage Policy\nOptimization (AR-EAPO), a model-free reinforcement learning (RL) algorithm that\ncombines average-reward RL and maximum entropy RL. Results demonstrate that our\ncontroller achieves improved performance and robustness scores compared to\nestablished baseline methods in both the acrobot and pendubot scenarios,\nwithout the need for a heavily engineered reward function or system model. The\ncurrent results are applicable exclusively to the simulation stage setup.\n","authors":["Jean Seong Bjorn Choe","Bumkyu Choi","Jong-kook Kim"],"pdf_url":"https://arxiv.org/pdf/2409.08938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08935v1","updated":"2024-09-13T15:55:05Z","published":"2024-09-13T15:55:05Z","title":"Optimization and Generalization Guarantees for Weight Normalization","summary":" Weight normalization (WeightNorm) is widely used in practice for the training\nof deep neural networks and modern deep learning libraries have built-in\nimplementations of it. In this paper, we provide the first theoretical\ncharacterizations of both optimization and generalization of deep WeightNorm\nmodels with smooth activation functions. For optimization, from the form of the\nHessian of the loss, we note that a small Hessian of the predictor leads to a\ntractable analysis. 
Thus, we bound the spectral norm of the Hessian of\nWeightNorm networks and show its dependence on the network width and weight\nnormalization terms--the latter being unique to networks without WeightNorm.\nThen, we use this bound to establish training convergence guarantees under\nsuitable assumptions for gradient decent. For generalization, we use WeightNorm\nto get a uniform convergence based generalization bound, which is independent\nfrom the width and depends sublinearly on the depth. Finally, we present\nexperimental results which illustrate how the normalization terms and other\nquantities of theoretical interest relate to the training of WeightNorm\nnetworks.\n","authors":["Pedro Cisneros-Velarde","Zhijie Chen","Sanmi Koyejo","Arindam Banerjee"],"pdf_url":"https://arxiv.org/pdf/2409.08935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16218v2","updated":"2024-09-13T15:45:10Z","published":"2024-03-24T16:18:27Z","title":"CoverUp: Coverage-Guided LLM-Based Test Generation","summary":" Testing is an essential part of software development. Test generation tools\nattempt to automate the otherwise labor-intensive task of test creation, but\ngenerating high-coverage tests remains a challenge. This paper proposes\nCoverUp, a novel approach to driving the generation of high-coverage Python\nregression tests. CoverUp iteratively improves test coverage, interleaving\ncoverage analysis with dialogs with the LLM that steer it to refine tests so\nthat they increase coverage of lines and branches. We evaluate our prototype\nCoverUp implementation across a benchmark of challenging code derived from\nopen-source Python projects, and show that CoverUp substantially improves on\nthe state of the art. Compared to CodaMosa, a hybrid search/LLM-based test\ngenerator, CoverUp achieves a per-module median line+branch coverage of 80%\n(vs. 47%). Compared to MuTAP, a mutation/LLM-based test generator, CoverUp\nachieves an overall line+branch coverage of 90% (vs. 77%). We show that\nCoverUp's iterative, coverage-guided approach is crucial to its effectiveness,\ncontributing to nearly 40% of its successes.\n","authors":["Juan Altmayer Pizzorno","Emery D. Berger"],"pdf_url":"https://arxiv.org/pdf/2403.16218v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2409.08925v1","updated":"2024-09-13T15:40:29Z","published":"2024-09-13T15:40:29Z","title":"Multi forests: Variable importance for multi-class outcomes","summary":" In prediction tasks with multi-class outcomes, identifying covariates\nspecifically associated with one or more outcome classes can be important.\nConventional variable importance measures (VIMs) from random forests (RFs),\nlike permutation and Gini importance, focus on overall predictive performance\nor node purity, without differentiating between the classes. Therefore, they\ncan be expected to fail to distinguish class-associated covariates from\ncovariates that only distinguish between groups of classes. We introduce a VIM\ncalled multi-class VIM, tailored for identifying exclusively class-associated\ncovariates, via a novel RF variant called multi forests (MuFs). The trees in\nMuFs use both multi-way and binary splitting. The multi-way splits generate\nchild nodes for each class, using a split criterion that evaluates how well\nthese nodes represent their respective classes. 
This setup forms the basis of\nthe multi-class VIM, which measures the discriminatory ability of the splits\nperformed in the respective covariates with regard to this split criterion.\nAlongside the multi-class VIM, we introduce a second VIM, the discriminatory\nVIM. This measure, based on the binary splits, assesses the strength of the\ngeneral influence of the covariates, irrespective of their\nclass-associatedness. Simulation studies demonstrate that the multi-class VIM\nspecifically ranks class-associated covariates highly, unlike conventional VIMs\nwhich also rank other types of covariates highly. Analyses of 121 datasets\nreveal that MuFs often have slightly lower predictive performance compared to\nconventional RFs. This is, however, not a limiting factor given the algorithm's\nprimary purpose of calculating the multi-class VIM.\n","authors":["Roman Hornung","Alexander Hapfelmeier"],"pdf_url":"https://arxiv.org/pdf/2409.08925v1.pdf","comment":"30 pages, 6 figures"},{"id":"http://arxiv.org/abs/2302.13080v3","updated":"2024-09-13T15:38:23Z","published":"2023-02-25T13:58:37Z","title":"Does a Neural Network Really Encode Symbolic Concepts?","summary":" Recently, a series of studies have tried to extract interactions between\ninput variables modeled by a DNN and define such interactions as concepts\nencoded by the DNN. However, strictly speaking, there still lacks a solid\nguarantee whether such interactions indeed represent meaningful concepts.\nTherefore, in this paper, we examine the trustworthiness of interaction\nconcepts from four perspectives. Extensive empirical studies have verified that\na well-trained DNN usually encodes sparse, transferable, and discriminative\nconcepts, which is partially aligned with human intuition.\n","authors":["Mingjie Li","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13080v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08919v1","updated":"2024-09-13T15:33:32Z","published":"2024-09-13T15:33:32Z","title":"XSub: Explanation-Driven Adversarial Attack against Blackbox Classifiers\n via Feature Substitution","summary":" Despite its significant benefits in enhancing the transparency and\ntrustworthiness of artificial intelligence (AI) systems, explainable AI (XAI)\nhas yet to reach its full potential in real-world applications. One key\nchallenge is that XAI can unintentionally provide adversaries with insights\ninto black-box models, inevitably increasing their vulnerability to various\nattacks. In this paper, we develop a novel explanation-driven adversarial\nattack against black-box classifiers based on feature substitution, called\nXSub. The key idea of XSub is to strategically replace important features\n(identified via XAI) in the original sample with corresponding important\nfeatures from a \"golden sample\" of a different label, thereby increasing the\nlikelihood of the model misclassifying the perturbed sample. The degree of\nfeature substitution is adjustable, allowing us to control how much of the\noriginal samples information is replaced. This flexibility effectively balances\na trade-off between the attacks effectiveness and its stealthiness. XSub is\nalso highly cost-effective in that the number of required queries to the\nprediction model and the explanation model in conducting the attack is in O(1).\nIn addition, XSub can be easily extended to launch backdoor attacks in case the\nattacker has access to the models training data. 
Our evaluation demonstrates\nthat XSub is not only effective and stealthy but also cost-effective, enabling\nits application across a wide range of AI models.\n","authors":["Kiana Vu","Phung Lai","Truc Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.08919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08917v1","updated":"2024-09-13T15:32:26Z","published":"2024-09-13T15:32:26Z","title":"Latent Space Score-based Diffusion Model for Probabilistic Multivariate\n Time Series Imputation","summary":" Accurate imputation is essential for the reliability and success of\ndownstream tasks. Recently, diffusion models have attracted great attention in\nthis field. However, these models neglect the latent distribution in a\nlower-dimensional space derived from the observed data, which limits the\ngenerative capacity of the diffusion model. Additionally, dealing with the\noriginal missing data without labels becomes particularly problematic. To\naddress these issues, we propose the Latent Space Score-Based Diffusion Model\n(LSSDM) for probabilistic multivariate time series imputation. Observed values\nare projected onto low-dimensional latent space and coarse values of the\nmissing data are reconstructed without knowing their ground truth values by\nthis unsupervised learning approach. Finally, the reconstructed values are fed\ninto a conditional diffusion model to obtain the precise imputed values of the\ntime series. In this way, LSSDM not only possesses the power to identify the\nlatent distribution but also seamlessly integrates the diffusion model to\nobtain the high-fidelity imputed values and assess the uncertainty of the\ndataset. Experimental results demonstrate that LSSDM achieves superior\nimputation performance while also providing a better explanation and\nuncertainty analysis of the imputation mechanism. The website of the code is\n\\textit{https://github.com/gorgen2020/LSSDM\\_imputation}.\n","authors":["Guojun Liang","Najmeh Abiri","Atiye Sadat Hashemi","Jens Lundström","Stefan Byttner","Prayag Tiwari"],"pdf_url":"https://arxiv.org/pdf/2409.08917v1.pdf","comment":"5 pages, conference"},{"id":"http://arxiv.org/abs/2409.08913v1","updated":"2024-09-13T15:29:37Z","published":"2024-09-13T15:29:37Z","title":"HLTCOE JHU Submission to the Voice Privacy Challenge 2024","summary":" We present a number of systems for the Voice Privacy Challenge, including\nvoice conversion based systems such as the kNN-VC method and the WavLM voice\nConversion method, and text-to-speech (TTS) based systems including\nWhisper-VITS. We found that while voice conversion systems better preserve\nemotional content, they struggle to conceal speaker identity in semi-white-box\nattack scenarios; conversely, TTS methods perform better at anonymization and\nworse at emotion preservation. Finally, we propose a random admixture system\nwhich seeks to balance out the strengths and weaknesses of the two category of\nsystems, achieving a strong EER of over 40% while maintaining UAR at a\nrespectable 47%.\n","authors":["Henry Li Xinyuan","Zexin Cai","Ashi Garg","Kevin Duh","Leibny Paola García-Perera","Sanjeev Khudanpur","Nicholas Andrews","Matthew Wiesner"],"pdf_url":"https://arxiv.org/pdf/2409.08913v1.pdf","comment":"Submission to the Voice Privacy Challenge 2024. 
Accepted and\n presented at"},{"id":"http://arxiv.org/abs/2409.08904v1","updated":"2024-09-13T15:15:45Z","published":"2024-09-13T15:15:45Z","title":"AnyBipe: An End-to-End Framework for Training and Deploying Bipedal\n Robots Guided by Large Language Models","summary":" Training and deploying reinforcement learning (RL) policies for robots,\nespecially in accomplishing specific tasks, presents substantial challenges.\nRecent advancements have explored diverse reward function designs, training\ntechniques, simulation-to-reality (sim-to-real) transfers, and performance\nanalysis methodologies, yet these still require significant human intervention.\nThis paper introduces an end-to-end framework for training and deploying RL\npolicies, guided by Large Language Models (LLMs), and evaluates its\neffectiveness on bipedal robots. The framework consists of three interconnected\nmodules: an LLM-guided reward function design module, an RL training module\nleveraging prior work, and a sim-to-real homomorphic evaluation module. This\ndesign significantly reduces the need for human input by utilizing only\nessential simulation and deployment platforms, with the option to incorporate\nhuman-engineered strategies and historical data. We detail the construction of\nthese modules, their advantages over traditional approaches, and demonstrate\nthe framework's capability to autonomously develop and refine controlling\nstrategies for bipedal robot locomotion, showcasing its potential to operate\nindependently of human intervention.\n","authors":["Yifei Yao","Wentao He","Chenyu Gu","Jiaheng Du","Fuwei Tan","Zhen Zhu","Junguo Lu"],"pdf_url":"https://arxiv.org/pdf/2409.08904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06724v2","updated":"2024-09-13T15:06:33Z","published":"2024-08-26T21:32:49Z","title":"MLP, XGBoost, KAN, TDNN, and LSTM-GRU Hybrid RNN with Attention for SPX\n and NDX European Call Option Pricing","summary":" We explore the performance of various artificial neural network\narchitectures, including a multilayer perceptron (MLP), Kolmogorov-Arnold\nnetwork (KAN), LSTM-GRU hybrid recursive neural network (RNN) models, and a\ntime-delay neural network (TDNN) for pricing European call options. In this\nstudy, we attempt to leverage the ability of supervised learning methods, such\nas ANNs, KANs, and gradient-boosted decision trees, to approximate complex\nmultivariate functions in order to calibrate option prices based on past market\ndata. The motivation for using ANNs and KANs is the Universal Approximation\nTheorem and Kolmogorov-Arnold Representation Theorem, respectively.\nSpecifically, we use S\\&P 500 (SPX) and NASDAQ 100 (NDX) index options traded\nduring 2015-2023 with times to maturity ranging from 15 days to over 4 years\n(OptionMetrics IvyDB US dataset). Black \\& Scholes's (BS) PDE \\cite{Black1973}\nmodel's performance in pricing the same options compared to real data is used\nas a benchmark. This model relies on strong assumptions, and it has been\nobserved and discussed in the literature that real data does not match its\npredictions. Supervised learning methods are widely used as an alternative for\ncalibrating option prices due to some of the limitations of this model. In our\nexperiments, the BS model underperforms compared to all of the others. Also,\nthe best TDNN model outperforms the best MLP model on all error metrics. We\nimplement a simple self-attention mechanism to enhance the RNN models,\nsignificantly improving their performance. 
The best-performing model overall is\nthe LSTM-GRU hybrid RNN model with attention. Also, the KAN model outperforms\nthe TDNN and MLP models. We analyze the performance of all models by ticker,\nmoneyness category, and over/under/correctly-priced percentage.\n","authors":["Boris Ter-Avanesov","Homayoon Beigi"],"pdf_url":"https://arxiv.org/pdf/2409.06724v2.pdf","comment":"78 pages, 39 figures"},{"id":"http://arxiv.org/abs/2404.01903v2","updated":"2024-09-13T14:56:46Z","published":"2024-04-02T12:44:44Z","title":"Understanding How CodeLLMs (Mis)Predict Types with Activation Steering","summary":" CodeLLMs are transforming software development as we know it. This is\nespecially true for tasks where rule-based approaches fall short, like type\nprediction. The type prediction task consists in adding a new type annotation\nto a partially typed program, such that the resulting program is closer to\nbeing fully typed. The intractability of rule-based approaches and high cost of\nmanual annotation make CodeLLMs an attractive solution to the problem. However,\nCodeLLMs are still far from being deployed on the large-scale due to doubts\nsurrounding their reliability.\n To shed some light on how CodeLLMs approach type prediction, we investigate\nwhat happens when a model mispredicts a type. We show that by applying\nsemantics-preserving edits to code, CodeLLMs are eventually misled into\nmispredicting type annotations. However, by leveraging activation steering we\nare able to \"steer\" the model back to the correct prediction, making models\nmore robust against semantically irrelevant prompt features. We show that\nsteering achieves comparable performance to fine-tuning directly on the type\nprediction task. Furthermore, we find that steering vectors computed from\nPython code are effective at correcting TypeScript mispredictions, and vice\nversa. To our knowledge, this is the first evidence of its kind to suggest that\nCodeLLMs learn task representations that transfer across languages.\n","authors":["Francesca Lucchetti","Arjun Guha"],"pdf_url":"https://arxiv.org/pdf/2404.01903v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.08884v1","updated":"2024-09-13T14:50:14Z","published":"2024-09-13T14:50:14Z","title":"Detect Fake with Fake: Leveraging Synthetic Data-driven Representation\n for Synthetic Image Detection","summary":" Are general-purpose visual representations acquired solely from synthetic\ndata useful for detecting fake images? In this work, we show the effectiveness\nof synthetic data-driven representations for synthetic image detection. Upon\nanalysis, we find that vision transformers trained by the latest visual\nrepresentation learners with synthetic data can effectively distinguish fake\nfrom real images without seeing any real images during pre-training. Notably,\nusing SynCLR as the backbone in a state-of-the-art detection method\ndemonstrates a performance improvement of +10.32 mAP and +4.73% accuracy over\nthe widely used CLIP, when tested on previously unseen GAN models. 
Code is\navailable at https://github.com/cvpaperchallenge/detect-fake-with-fake.\n","authors":["Hina Otake","Yoshihiro Fukuhara","Yoshiki Kubotani","Shigeo Morishima"],"pdf_url":"https://arxiv.org/pdf/2409.08884v1.pdf","comment":"Accepted to TWYN workshop at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.09158v2","updated":"2024-09-13T14:34:26Z","published":"2024-08-17T10:06:50Z","title":"Linear Attention is Enough in Spatial-Temporal Forecasting","summary":" As the most representative scenario of spatial-temporal forecasting tasks,\nthe traffic forecasting task attracted numerous attention from machine learning\ncommunity due to its intricate correlation both in space and time dimension.\nExisting methods often treat road networks over time as spatial-temporal\ngraphs, addressing spatial and temporal representations independently. However,\nthese approaches struggle to capture the dynamic topology of road networks,\nencounter issues with message passing mechanisms and over-smoothing, and face\nchallenges in learning spatial and temporal relationships separately. To\naddress these limitations, we propose treating nodes in road networks at\ndifferent time steps as independent spatial-temporal tokens and feeding them\ninto a vanilla Transformer to learn complex spatial-temporal patterns, design\n\\textbf{STformer} achieving SOTA. Given its quadratic complexity, we introduce\na variant \\textbf{NSTformer} based on Nystr$\\ddot{o}$m method to approximate\nself-attention with linear complexity but even slightly better than former in a\nfew cases astonishingly. Extensive experimental results on traffic datasets\ndemonstrate that the proposed method achieves state-of-the-art performance at\nan affordable computational cost. Our code is available at\n\\href{https://github.com/XinyuNing/STformer-and-NSTformer}{https://github.com/XinyuNing/STformer-and-NSTformer}.\n","authors":["Xinyu Ning"],"pdf_url":"https://arxiv.org/pdf/2408.09158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07347v2","updated":"2024-09-13T14:32:10Z","published":"2024-09-11T15:31:40Z","title":"The Role of Explainable AI in Revolutionizing Human Health Monitoring","summary":" The complex nature of disease mechanisms and the variability of patient\nsymptoms present significant obstacles in developing effective diagnostic\ntools. Although machine learning has made considerable advances in medical\ndiagnosis, its decision-making processes frequently lack transparency, which\ncan jeopardize patient outcomes. This underscores the critical need for\nExplainable AI (XAI), which not only offers greater clarity but also has the\npotential to significantly improve patient care. In this literature review, we\nconduct a detailed analysis of analyzing XAI methods identified through\nsearches across various databases, focusing on chronic conditions such as\nParkinson's, stroke, depression, cancer, heart disease, and Alzheimer's\ndisease. The literature search revealed the application of 9 trending XAI\nalgorithms in the field of healthcare and highlighted the pros and cons of each\nof them. 
Thus, the article is concluded with a critical appraisal of the\nchallenges and future research opportunities for XAI in human health\nmonitoring.\n","authors":["Abdullah Alharthi","Ahmed Alqurashi","Turki Alharbi","Mohammed Alammar","Nasser Aldosari","Houssem Bouchekara","Yusuf Shaaban","Mohammad Shoaib Shahriar","Abdulrahman Al Ayidh"],"pdf_url":"https://arxiv.org/pdf/2409.07347v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08864v1","updated":"2024-09-13T14:26:58Z","published":"2024-09-13T14:26:58Z","title":"Exploring Graph Structure Comprehension Ability of Multimodal Large\n Language Models: Case Studies","summary":" Large Language Models (LLMs) have shown remarkable capabilities in processing\nvarious data structures, including graphs. While previous research has focused\non developing textual encoding methods for graph representation, the emergence\nof multimodal LLMs presents a new frontier for graph comprehension. These\nadvanced models, capable of processing both text and images, offer potential\nimprovements in graph understanding by incorporating visual representations\nalongside traditional textual data. This study investigates the impact of graph\nvisualisations on LLM performance across a range of benchmark tasks at node,\nedge, and graph levels. Our experiments compare the effectiveness of multimodal\napproaches against purely textual graph representations. The results provide\nvaluable insights into both the potential and limitations of leveraging visual\ngraph modalities to enhance LLMs' graph structure comprehension abilities.\n","authors":["Zhiqiang Zhong","Davide Mottin"],"pdf_url":"https://arxiv.org/pdf/2409.08864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08861v1","updated":"2024-09-13T14:22:14Z","published":"2024-09-13T14:22:14Z","title":"Adjoint Matching: Fine-tuning Flow and Diffusion Generative Models with\n Memoryless Stochastic Optimal Control","summary":" Dynamical generative models that produce samples through an iterative\nprocess, such as Flow Matching and denoising diffusion models, have seen\nwidespread use, but there has not been many theoretically-sound methods for\nimproving these models with reward fine-tuning. In this work, we cast reward\nfine-tuning as stochastic optimal control (SOC). Critically, we prove that a\nvery specific memoryless noise schedule must be enforced during fine-tuning, in\norder to account for the dependency between the noise variable and the\ngenerated samples. We also propose a new algorithm named Adjoint Matching which\noutperforms existing SOC algorithms, by casting SOC problems as a regression\nproblem. We find that our approach significantly improves over existing methods\nfor reward fine-tuning, achieving better consistency, realism, and\ngeneralization to unseen human preference reward models, while retaining sample\ndiversity.\n","authors":["Carles Domingo-Enrich","Michal Drozdzal","Brian Karrer","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08847v1","updated":"2024-09-13T14:05:26Z","published":"2024-09-13T14:05:26Z","title":"Kinect Calibration and Data Optimization For Anthropometric Parameters","summary":" Recently, through development of several 3d vision systems, widely used in\nvarious applications, medical and biometric fields. Microsoft kinect sensor\nhave been most of used camera among 3d vision systems. Microsoft kinect sensor\ncan obtain depth images of a scene and 3d coordinates of human joints. 
Thus,\nanthropometric features can extractable easily. Anthropometric feature and 3d\njoint coordinate raw datas which captured from kinect sensor is unstable. The\nstrongest reason for this, datas vary by distance between joints of individual\nand location of kinect sensor. Consequently, usage of this datas without kinect\ncalibration and data optimization does not result in sufficient and healthy. In\nthis study, proposed a novel method to calibrating kinect sensor and optimizing\nskeleton features. Results indicate that the proposed method is quite effective\nand worthy of further study in more general scenarios.\n","authors":["M. S. Gokmen","M. Akbaba","O. Findik"],"pdf_url":"https://arxiv.org/pdf/2409.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08846v1","updated":"2024-09-13T14:04:39Z","published":"2024-09-13T14:04:39Z","title":"FP-VEC: Fingerprinting Large Language Models via Efficient Vector\n Addition","summary":" Training Large Language Models (LLMs) requires immense computational power\nand vast amounts of data. As a result, protecting the intellectual property of\nthese models through fingerprinting is essential for ownership authentication.\nWhile adding fingerprints to LLMs through fine-tuning has been attempted, it\nremains costly and unscalable. In this paper, we introduce FP-VEC, a pilot\nstudy on using fingerprint vectors as an efficient fingerprinting method for\nLLMs. Our approach generates a fingerprint vector that represents a\nconfidential signature embedded in the model, allowing the same fingerprint to\nbe seamlessly incorporated into an unlimited number of LLMs via vector\naddition. Results on several LLMs show that FP-VEC is lightweight by running on\nCPU-only devices for fingerprinting, scalable with a single training and\nunlimited fingerprinting process, and preserves the model's normal behavior.\nThe project page is available at https://fingerprintvector.github.io .\n","authors":["Zhenhua Xu","Wenpeng Xing","Zhebo Wang","Chang Hu","Chen Jie","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2409.08846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08839v1","updated":"2024-09-13T13:53:41Z","published":"2024-09-13T13:53:41Z","title":"RF Challenge: The Data-Driven Radio Frequency Signal Separation\n Challenge","summary":" This paper addresses the critical problem of interference rejection in\nradio-frequency (RF) signals using a novel, data-driven approach that leverages\nstate-of-the-art AI models. Traditionally, interference rejection algorithms\nare manually tailored to specific types of interference. This work introduces a\nmore scalable data-driven solution and contains the following contributions.\nFirst, we present an insightful signal model that serves as a foundation for\ndeveloping and analyzing interference rejection algorithms. Second, we\nintroduce the RF Challenge, a publicly available dataset featuring diverse RF\nsignals along with code templates, which facilitates data-driven analysis of RF\nsignal problems. Third, we propose novel AI-based rejection algorithms,\nspecifically architectures like UNet and WaveNet, and evaluate their\nperformance across eight different signal mixture types. These models\ndemonstrate superior performance exceeding traditional methods like matched\nfiltering and linear minimum mean square error estimation by up to two orders\nof magnitude in bit-error rate. 
Fourth, we summarize the results from an open\ncompetition hosted at 2024 IEEE International Conference on Acoustics, Speech,\nand Signal Processing (ICASSP 2024) based on the RF Challenge, highlighting the\nsignificant potential for continued advancements in this area. Our findings\nunderscore the promise of deep learning algorithms in mitigating interference,\noffering a strong foundation for future research.\n","authors":["Alejandro Lancho","Amir Weiss","Gary C. F. Lee","Tejas Jayashankar","Binoy Kurien","Yury Polyanskiy","Gregory W. Wornell"],"pdf_url":"https://arxiv.org/pdf/2409.08839v1.pdf","comment":"14 pages, 12 figures, submitted to the IEEE Open Journal of the\n Communications Society"},{"id":"http://arxiv.org/abs/2409.08832v1","updated":"2024-09-13T13:48:06Z","published":"2024-09-13T13:48:06Z","title":"Can Kans (re)discover predictive models for Direct-Drive Laser Fusion?","summary":" The domain of laser fusion presents a unique and challenging predictive\nmodeling application landscape for machine learning methods due to high problem\ncomplexity and limited training data. Data-driven approaches utilizing\nprescribed functional forms, inductive biases and physics-informed learning\n(PIL) schemes have been successful in the past for achieving desired\ngeneralization ability and model interpretation that aligns with physics\nexpectations. In complex multi-physics application domains, however, it is not\nalways obvious how architectural biases or discriminative penalties can be\nformulated. In this work, focusing on nuclear fusion energy using high powered\nlasers, we present the use of Kolmogorov-Arnold Networks (KANs) as an\nalternative to PIL for developing a new type of data-driven predictive model\nwhich is able to achieve high prediction accuracy and physics interpretability.\nA KAN based model, a MLP with PIL, and a baseline MLP model are compared in\ngeneralization ability and interpretation with a domain expert-derived symbolic\nregression model. Through empirical studies in this high physics complexity\ndomain, we show that KANs can potentially provide benefits when developing\npredictive models for data-starved physics applications.\n","authors":["Rahman Ejaz","Varchas Gopalaswamy","Riccardo Betti","Aarne Lees","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2409.08832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14483v2","updated":"2024-09-13T13:38:24Z","published":"2024-01-25T19:36:11Z","title":"Four Facets of Forecast Felicity: Calibration, Predictiveness,\n Randomness and Regret","summary":" Machine learning is about forecasting. Forecasts, however, obtain their\nusefulness only through their evaluation. Machine learning has traditionally\nfocused on types of losses and their corresponding regret. Currently, the\nmachine learning community regained interest in calibration. In this work, we\nshow the conceptual equivalence of calibration and regret in evaluating\nforecasts. We frame the evaluation problem as a game between a forecaster, a\ngambler and nature. Putting intuitive restrictions on gambler and forecaster,\ncalibration and regret naturally fall out of the framework. In addition, this\ngame links evaluation of forecasts to randomness of outcomes. Random outcomes\nwith respect to forecasts are equivalent to good forecasts with respect to\noutcomes. We call those dual aspects, calibration and regret, predictiveness\nand randomness, the four facets of forecast felicity.\n","authors":["Rabanus Derr","Robert C. 
Williamson"],"pdf_url":"https://arxiv.org/pdf/2401.14483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08823v1","updated":"2024-09-13T13:36:51Z","published":"2024-09-13T13:36:51Z","title":"AutoIRT: Calibrating Item Response Theory Models with Automated Machine\n Learning","summary":" Item response theory (IRT) is a class of interpretable factor models that are\nwidely used in computerized adaptive tests (CATs), such as language proficiency\ntests. Traditionally, these are fit using parametric mixed effects models on\nthe probability of a test taker getting the correct answer to a test item\n(i.e., question). Neural net extensions of these models, such as BertIRT,\nrequire specialized architectures and parameter tuning. We propose a multistage\nfitting procedure that is compatible with out-of-the-box Automated Machine\nLearning (AutoML) tools. It is based on a Monte Carlo EM (MCEM) outer loop with\na two stage inner loop, which trains a non-parametric AutoML grade model using\nitem features followed by an item specific parametric model. This greatly\naccelerates the modeling workflow for scoring tests. We demonstrate its\neffectiveness by applying it to the Duolingo English Test, a high stakes,\nonline English proficiency test. We show that the resulting model is typically\nmore well calibrated, gets better predictive performance, and more accurate\nscores than existing methods (non-explanatory IRT models and explanatory IRT\nmodels like BERT-IRT). Along the way, we provide a brief survey of machine\nlearning methods for calibration of item parameters for CATs.\n","authors":["James Sharpnack","Phoebe Mulcaire","Klinton Bicknell","Geoff LaFlair","Kevin Yancey"],"pdf_url":"https://arxiv.org/pdf/2409.08823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15654v3","updated":"2024-09-13T13:28:31Z","published":"2023-11-27T09:33:56Z","title":"Event Detection in Time Series: Universal Deep Learning Approach","summary":" Event detection in time series is a challenging task due to the prevalence of\nimbalanced datasets, rare events, and time interval-defined events. Traditional\nsupervised deep learning methods primarily employ binary classification, where\neach time step is assigned a binary label indicating the presence or absence of\nan event. However, these methods struggle to handle these specific scenarios\neffectively. To address these limitations, we propose a novel supervised\nregression-based deep learning approach that offers several advantages over\nclassification-based methods. Our approach, with a limited number of\nparameters, can effectively handle various types of events within a unified\nframework, including rare events and imbalanced datasets. We provide\ntheoretical justifications for its universality and precision and demonstrate\nits superior performance across diverse domains, particularly for rare events\nand imbalanced datasets.\n","authors":["Menouar Azib","Benjamin Renard","Philippe Garnier","Vincent Génot","Nicolas André"],"pdf_url":"https://arxiv.org/pdf/2311.15654v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13794v2","updated":"2024-09-13T13:28:04Z","published":"2024-02-21T13:24:14Z","title":"Revisiting Convergence of AdaGrad with Relaxed Assumptions","summary":" In this study, we revisit the convergence of AdaGrad with momentum (covering\nAdaGrad as a special case) on non-convex smooth optimization problems. 
We\nconsider a general noise model where the noise magnitude is controlled by the\nfunction value gap together with the gradient magnitude. This model encompasses\na broad range of noises including bounded noise, sub-Gaussian noise, affine\nvariance noise and the expected smoothness, and it has been shown to be more\nrealistic in many practical applications. Our analysis yields a probabilistic\nconvergence rate which, under the general noise, could reach\n$\tilde{\mathcal{O}}(1/\sqrt{T})$. This rate does not rely on prior knowledge\nof problem-parameters and could accelerate to $\tilde{\mathcal{O}}(1/T)$ where\n$T$ denotes the total number of iterations, when the noise parameters related to\nthe function value gap and noise level are sufficiently small. The convergence\nrate thus matches the lower rate for stochastic first-order methods over\nnon-convex smooth landscape up to logarithm terms [Arjevani et al., 2023]. We\nfurther derive a convergence bound for AdaGrad with momentum, considering the\ngeneralized smoothness where the local smoothness is controlled by a\nfirst-order function of the gradient norm.\n","authors":["Yusu Hong","Junhong Lin"],"pdf_url":"https://arxiv.org/pdf/2402.13794v2.pdf","comment":"Accepted by UAI 2024"},{"id":"http://arxiv.org/abs/2409.08806v1","updated":"2024-09-13T13:14:54Z","published":"2024-09-13T13:14:54Z","title":"TabKANet: Tabular Data Modelling with Kolmogorov-Arnold Network and\n Transformer","summary":" Tabular data is the most common type of data in real-life scenarios. In this\nstudy, we propose a method based on the TabKANet architecture, which utilizes\nthe Kolmogorov-Arnold network to encode numerical features and merge them with\ncategorical features, enabling unified modeling of tabular data on the\nTransformer architecture. This model demonstrates outstanding performance in\nsix widely used binary classification tasks, suggesting that TabKANet has the\npotential to become a standard approach for tabular modeling, surpassing\ntraditional neural networks. Furthermore, this research reveals the significant\nadvantages of the Kolmogorov-Arnold network in encoding numerical features. The\ncode of our work is available at https://github.com/tsinghuamedgao20/TabKANet.\n","authors":["Weihao Gao","Zheng Gong","Zhuo Deng","Fuju Rong","Chucheng Chen","Lan Ma"],"pdf_url":"https://arxiv.org/pdf/2409.08806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08788v1","updated":"2024-09-13T12:50:36Z","published":"2024-09-13T12:50:36Z","title":"Electrocardiogram Report Generation and Question Answering via\n Retrieval-Augmented Self-Supervised Modeling","summary":" Interpreting electrocardiograms (ECGs) and generating comprehensive reports\nremain challenging tasks in cardiology, often requiring specialized expertise\nand significant time investment. To address these critical issues, we propose\nECG-ReGen, a retrieval-based approach for ECG-to-text report generation and\nquestion answering. Our method leverages a self-supervised learning for the ECG\nencoder, enabling efficient similarity searches and report retrieval. By\ncombining pre-training with dynamic retrieval and Large Language Model\n(LLM)-based refinement, ECG-ReGen effectively analyzes ECG data and answers\nrelated queries, with the potential of improving patient care. 
Experiments\nconducted on the PTB-XL and MIMIC-IV-ECG datasets demonstrate superior\nperformance in both in-domain and cross-domain scenarios for report generation.\nFurthermore, our approach exhibits competitive performance on ECG-QA dataset\ncompared to fully supervised methods when utilizing off-the-shelf LLMs for\nzero-shot question answering. This approach, effectively combining\nself-supervised encoder and LLMs, offers a scalable and efficient solution for\naccurate ECG interpretation, holding significant potential to enhance clinical\ndecision-making.\n","authors":["Jialu Tang","Tong Xia","Yuan Lu","Cecilia Mascolo","Aaqib Saeed"],"pdf_url":"https://arxiv.org/pdf/2409.08788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08786v1","updated":"2024-09-13T12:45:30Z","published":"2024-09-13T12:45:30Z","title":"Deep Learning-based Codes for Wiretap Fading Channels","summary":" The wiretap channel is a well-studied problem in the physical layer security\n(PLS) literature. Although it is proven that the decoding error probability and\ninformation leakage can be made arbitrarily small in the asymptotic regime,\nfurther research on finite-blocklength codes is required on the path towards\npractical, secure communications systems. This work provides the first\nexperimental characterization of a deep learning-based, finite-blocklength code\nconstruction for multi-tap fading wiretap channels without channel state\ninformation (CSI). In addition to the evaluation of the average probability of\nerror and information leakage, we illustrate the influence of (i) the number of\nfading taps, (ii) differing variances of the fading coefficients and (iii) the\nseed selection for the hash function-based security layer.\n","authors":["Daniel Seifert","Onur Günlü","Rafael F. Schaefer"],"pdf_url":"https://arxiv.org/pdf/2409.08786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09895v4","updated":"2024-09-13T12:28:45Z","published":"2024-08-19T11:09:12Z","title":"Performance Law of Large Language Models","summary":" Guided by the belief of the scaling law, large language models (LLMs) have\nachieved impressive performance in recent years. However, scaling law only\ngives a qualitative estimation of loss, which is influenced by various factors\nsuch as model architectures, data distributions, tokenizers, and computation\nprecision. Thus, estimating the real performance of LLMs with different\ntraining settings rather than loss may be quite useful in practical\ndevelopment. In this article, we present an empirical equation named\n\"Performance Law\" to directly predict the MMLU score of an LLM, which is a\nwidely used metric to indicate the general capability of LLMs in real-world\nconversations and applications. Based on only a few key hyperparameters of the\nLLM architecture and the size of training data, we obtain a quite accurate MMLU\nprediction of various LLMs with diverse sizes and architectures developed by\ndifferent organizations in different years. 
Performance law can be used to\nguide the choice of LLM architecture and the effective allocation of\ncomputational resources without extensive experiments.\n","authors":["Chuhan Wu","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2408.09895v4.pdf","comment":"Personal opinions of the authors"},{"id":"http://arxiv.org/abs/2409.08771v1","updated":"2024-09-13T12:28:42Z","published":"2024-09-13T12:28:42Z","title":"In-depth Analysis of Low-rank Matrix Factorisation in a Federated\n Setting","summary":" We analyze a distributed algorithm to compute a low-rank matrix factorization\non $N$ clients, each holding a local dataset $\mathbf{S}^i \in \mathbb{R}^{n_i\n\times d}$. Mathematically, we seek to solve $\min_{\mathbf{U}^i \in\n\mathbb{R}^{n_i\times r}, \mathbf{V}\in \mathbb{R}^{d \times r} } \frac{1}{2}\n\sum_{i=1}^N \|\mathbf{S}^i - \mathbf{U}^i \mathbf{V}^\top\|^2_{\text{F}}$.\nConsidering a power initialization of $\mathbf{V}$, we rewrite the previous\nsmooth non-convex problem into a smooth strongly-convex problem that we solve\nusing a parallel Nesterov gradient descent, potentially requiring a single step\nof communication at the initialization step. For any client $i$ in $\{1, \dots,\nN\}$, we obtain a global $\mathbf{V}$ in $\mathbb{R}^{d \times r}$ common to\nall clients and a local variable $\mathbf{U}^i$ in $\mathbb{R}^{n_i \times r}$.\nWe provide a linear rate of convergence of the excess loss which depends on\n$\sigma_{\max} / \sigma_{r}$, where $\sigma_{r}$ is the $r^{\mathrm{th}}$\nsingular value of the concatenation $\mathbf{S}$ of the matrices\n$(\mathbf{S}^i)_{i=1}^N$. This result improves the rates of convergence given\nin the literature, which depend on $\sigma_{\max}^2 / \sigma_{\min}^2$. We\nprovide an upper bound on the Frobenius-norm error of reconstruction under the\npower initialization strategy. We complete our analysis with experiments on\nboth synthetic and real data.\n","authors":["Constantin Philippenko","Kevin Scaman","Laurent Massoulié"],"pdf_url":"https://arxiv.org/pdf/2409.08771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16318v2","updated":"2024-09-13T12:27:44Z","published":"2024-01-29T17:21:41Z","title":"Defining and Extracting generalizable interaction primitives from DNNs","summary":" Faithfully summarizing the knowledge encoded by a deep neural network (DNN)\ninto a few symbolic primitive patterns without losing much information\nrepresents a core challenge in explainable AI. To this end, Ren et al. (2024)\nhave derived a series of theorems to prove that the inference score of a DNN\ncan be explained as a small set of interactions between input variables.\nHowever, the lack of generalization power makes it still hard to consider such\ninteractions as faithful primitive patterns encoded by the DNN. Therefore,\ngiven different DNNs trained for the same task, we develop a new method to\nextract interactions that are shared by these DNNs. 
Experiments show that the\nextracted interactions can better reflect common knowledge shared by different\nDNNs.\n","authors":["Lu Chen","Siyu Lou","Benhao Huang","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.16318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17323v2","updated":"2024-09-13T12:25:58Z","published":"2024-06-25T07:14:15Z","title":"XAMI -- A Benchmark Dataset for Artefact Detection in XMM-Newton Optical\n Images","summary":" Reflected or scattered light produce artefacts in astronomical observations\nthat can negatively impact the scientific study. Hence, automated detection of\nthese artefacts is highly beneficial, especially with the increasing amounts of\ndata gathered. Machine learning methods are well-suited to this problem, but\ncurrently there is a lack of annotated data to train such approaches to detect\nartefacts in astronomical observations. In this work, we present a dataset of\nimages from the XMM-Newton space telescope Optical Monitoring camera showing\ndifferent types of artefacts. We hand-annotated a sample of 1000 images with\nartefacts which we use to train automated ML methods. We further demonstrate\ntechniques tailored for accurate detection and masking of artefacts using\ninstance segmentation. We adopt a hybrid approach, combining knowledge from\nboth convolutional neural networks (CNNs) and transformer-based models and use\ntheir advantages in segmentation. The presented method and dataset will advance\nartefact detection in astronomical observations by providing a reproducible\nbaseline. All code and data are made available\n(https://github.com/ESA-Datalabs/XAMI-model and\nhttps://github.com/ESA-Datalabs/XAMI-dataset).\n","authors":["Elisabeta-Iulia Dima","Pablo Gómez","Sandor Kruk","Peter Kretschmar","Simon Rosen","Călin-Adrian Popa"],"pdf_url":"https://arxiv.org/pdf/2406.17323v2.pdf","comment":"Accepted for oral presentation at SPAICE 2024"},{"id":"http://arxiv.org/abs/2409.08770v1","updated":"2024-09-13T12:24:12Z","published":"2024-09-13T12:24:12Z","title":"Increasing Both Batch Size and Learning Rate Accelerates Stochastic\n Gradient Descent","summary":" The performance of mini-batch stochastic gradient descent (SGD) strongly\ndepends on setting the batch size and learning rate to minimize the empirical\nloss in training the deep neural network. In this paper, we present theoretical\nanalyses of mini-batch SGD with four schedulers: (i) constant batch size and\ndecaying learning rate scheduler, (ii) increasing batch size and decaying\nlearning rate scheduler, (iii) increasing batch size and increasing learning\nrate scheduler, and (iv) increasing batch size and warm-up decaying learning\nrate scheduler. We show that mini-batch SGD using scheduler (i) does not always\nminimize the expectation of the full gradient norm of the empirical loss,\nwhereas it does using any of schedulers (ii), (iii), and (iv). Furthermore,\nschedulers (iii) and (iv) accelerate mini-batch SGD. 
The paper also provides\nnumerical results supporting the analyses, showing that using scheduler (iii) or\n(iv) minimizes the full gradient norm of the empirical loss faster than using\nscheduler (i) or (ii).\n","authors":["Hikaru Umeda","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2409.08770v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08768v1","updated":"2024-09-13T12:20:41Z","published":"2024-09-13T12:20:41Z","title":"Measure-Theoretic Time-Delay Embedding","summary":" The celebrated Takens' embedding theorem provides a theoretical foundation\nfor reconstructing the full state of a dynamical system from partial\nobservations. However, the classical theorem assumes that the underlying system\nis deterministic and that observations are noise-free, limiting its\napplicability in real-world scenarios. Motivated by these limitations, we\nrigorously establish a measure-theoretic generalization that adopts an Eulerian\ndescription of the dynamics and recasts the embedding as a pushforward map\nbetween probability spaces. Our mathematical results leverage recent advances\nin optimal transportation theory. Building on our novel measure-theoretic\ntime-delay embedding theory, we have developed a new computational framework\nthat forecasts the full state of a dynamical system from time-lagged partial\nobservations, engineered with better robustness to handle sparse and noisy\ndata. We showcase the efficacy and versatility of our approach through several\nnumerical examples, ranging from the classic Lorenz-63 system to large-scale,\nreal-world applications such as NOAA sea surface temperature forecasting and\nERA5 wind field reconstruction.\n","authors":["Jonah Botvinick-Greenhouse","Maria Oprea","Romit Maulik","Yunan Yang"],"pdf_url":"https://arxiv.org/pdf/2409.08768v1.pdf","comment":"32 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.08766v1","updated":"2024-09-13T12:20:02Z","published":"2024-09-13T12:20:02Z","title":"SAUC: Sparsity-Aware Uncertainty Calibration for Spatiotemporal\n Prediction with Graph Neural Networks","summary":" Quantifying uncertainty is crucial for robust and reliable predictions.\nHowever, existing spatiotemporal deep learning mostly focuses on deterministic\nprediction, overlooking the inherent uncertainty in such prediction.\nParticularly, highly-granular spatiotemporal datasets are often sparse, posing\nextra challenges in prediction and uncertainty quantification. To address these\nissues, this paper introduces a novel post-hoc Sparsity-aware Uncertainty\nCalibration (SAUC) framework, which calibrates uncertainty in both zero and\nnon-zero values. To develop SAUC, we first modify the state-of-the-art\ndeterministic spatiotemporal Graph Neural Networks (ST-GNNs) to probabilistic\nones in the pre-calibration phase. Then we calibrate the probabilistic ST-GNNs\nfor zero and non-zero values using quantile approaches. Through extensive\nexperiments, we demonstrate that SAUC can effectively fit the variance of\nsparse data and generalize across two real-world spatiotemporal datasets at\nvarious granularities. Specifically, our empirical experiments show a 20\% \nreduction in calibration errors in zero entries on the sparse traffic accident\nand urban crime prediction tasks. 
Overall, this work demonstrates the theoretical and\nempirical values of the SAUC framework, thus bridging a significant gap between\nuncertainty quantification and spatiotemporal prediction.\n","authors":["Dingyi Zhuang","Yuheng Bu","Guang Wang","Shenhao Wang","Jinhua Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.08766v1.pdf","comment":"Paper accepted by ACM SIGSPATIAL 2024"},{"id":"http://arxiv.org/abs/2306.11474v2","updated":"2024-09-13T12:17:14Z","published":"2023-06-20T11:54:36Z","title":"A Passivity-Based Method for Accelerated Convex Optimisation","summary":" This study presents a constructive methodology for designing accelerated\nconvex optimisation algorithms in continuous-time domain. The two key enablers\nare the classical concept of passivity in control theory and the time-dependent\nchange of variables that maps the output of the internal dynamic system to the\noptimisation variables. The Lyapunov function associated with the optimisation\ndynamics is obtained as a natural consequence of specifying the internal\ndynamics that drives the state evolution as a passive linear time-invariant\nsystem. The passivity-based methodology provides a general framework that has\nthe flexibility to generate convex optimisation algorithms with the guarantee\nof different convergence rate bounds on the objective function value. The same\nprinciple applies to the design of online parameter update algorithms for\nadaptive control by re-defining the output of internal dynamics to allow for\nthe feedback interconnection with tracking error dynamics.\n","authors":["Namhoon Cho","Hyo-Sang Shin"],"pdf_url":"https://arxiv.org/pdf/2306.11474v2.pdf","comment":"10 pages, 1 figure, accepted for presentation at 2024 IEEE CDC"},{"id":"http://arxiv.org/abs/2409.08763v1","updated":"2024-09-13T12:11:42Z","published":"2024-09-13T12:11:42Z","title":"Energy Consumption Trends in Sound Event Detection Systems","summary":" Deep learning systems have become increasingly energy- and\ncomputation-intensive, raising concerns about their environmental impact. As\norganizers of the Detection and Classification of Acoustic Scenes and Events\n(DCASE) challenge, we recognize the importance of addressing this issue. For\nthe past three years, we have integrated energy consumption metrics into the\nevaluation of sound event detection (SED) systems. In this paper, we analyze\nthe impact of this energy criterion on the challenge results and explore the\nevolution of system complexity and energy consumption over the years. We\nhighlight a shift towards more energy-efficient approaches during training\nwithout compromising performance, while the number of operations and system\ncomplexity continue to grow. Through this analysis, we hope to promote more\nenvironmentally friendly practices within the SED community.\n","authors":["Constance Douwes","Romain Serizel"],"pdf_url":"https://arxiv.org/pdf/2409.08763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05368v2","updated":"2024-09-13T12:10:20Z","published":"2024-03-08T14:59:15Z","title":"Exploring the Links between the Fundamental Lemma and Kernel Regression","summary":" Generalizations and variations of the fundamental lemma by Willems et al. are\nan active topic of recent research. In this note, we explore and formalize the\nlinks between kernel regression and some known nonlinear extensions of the\nfundamental lemma. 
Applying a transformation to the usual linear equation in\nHankel matrices, we arrive at an alternative implicit kernel representation of\nthe system trajectories while keeping the requirements on persistency of\nexcitation. We show that this representation is equivalent to the solution of a\nspecific kernel regression problem. We explore the possible structures of the\nunderlying kernel as well as the system classes to which they correspond.\n","authors":["Oleksii Molodchyk","Timm Faulwasser"],"pdf_url":"https://arxiv.org/pdf/2403.05368v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.08760v1","updated":"2024-09-13T12:09:09Z","published":"2024-09-13T12:09:09Z","title":"Online Network Inference from Graph-Stationary Signals with Hidden Nodes","summary":" Graph learning is the fundamental task of estimating unknown graph\nconnectivity from available data. Typical approaches assume that not only is\nall information available simultaneously but also that all nodes can be\nobserved. However, in many real-world scenarios, data can neither be known\ncompletely nor obtained all at once. We present a novel method for online graph\nestimation that accounts for the presence of hidden nodes. We consider signals\nthat are stationary on the underlying graph, which provides a model for the\nunknown connections to hidden nodes. We then formulate a convex optimization\nproblem for graph learning from streaming, incomplete graph signals. We solve\nthe proposed problem through an efficient proximal gradient algorithm that can\nrun in real-time as data arrives sequentially. Additionally, we provide\ntheoretical conditions under which our online algorithm is similar to\nbatch-wise solutions. Through experimental results on synthetic and real-world\ndata, we demonstrate the viability of our approach for online graph learning in\nthe presence of missing observations.\n","authors":["Andrei Buciulea","Madeline Navarro","Samuel Rey","Santiago Segarra","Antonio G. Marques"],"pdf_url":"https://arxiv.org/pdf/2409.08760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08751v2","updated":"2024-09-13T12:06:13Z","published":"2023-09-15T20:27:47Z","title":"Diverse Neural Audio Embeddings -- Bringing Features back !","summary":" With the advent of modern AI architectures, a shift has happened towards\nend-to-end architectures. This pivot has led to neural architectures being\ntrained without domain-specific biases/knowledge, optimized according to the\ntask. We in this paper, learn audio embeddings via diverse feature\nrepresentations, in this case, domain-specific. For the case of audio\nclassification over hundreds of categories of sound, we learn robust separate\nembeddings for diverse audio properties such as pitch, timbre, and neural\nrepresentation, along with also learning it via an end-to-end architecture. 
We\nobserve that handcrafted embeddings, e.g., pitch- and timbre-based ones, are not\nable on their own to beat a fully end-to-end representation, yet adding them\ntogether with the end-to-end embedding significantly improves performance.\nThis work paves the way for bringing domain expertise together with end-to-end\nmodels to learn robust, diverse representations, surpassing the performance of\njust training end-to-end models.\n","authors":["Prateek Verma"],"pdf_url":"https://arxiv.org/pdf/2309.08751v2.pdf","comment":"6 pages, 1 figure, 2 tables, Under Review for 50th IEEE ICASSP 2025,\n Hyderabad, India"},{"id":"http://arxiv.org/abs/2409.08754v1","updated":"2024-09-13T12:04:45Z","published":"2024-09-13T12:04:45Z","title":"Uncertainty Estimation by Density Aware Evidential Deep Learning","summary":" Evidential deep learning (EDL) has shown remarkable success in uncertainty\nestimation. However, there is still room for improvement, particularly in\nout-of-distribution (OOD) detection and classification tasks. The limited OOD\ndetection performance of EDL arises from its inability to reflect the distance\nbetween the testing example and training data when quantifying uncertainty,\nwhile its limited classification performance stems from its parameterization of\nthe concentration parameters. To address these limitations, we propose a novel\nmethod called Density Aware Evidential Deep Learning (DAEDL). DAEDL integrates\nthe feature space density of the testing example with the output of EDL during\nthe prediction stage, while using a novel parameterization that resolves the\nissues in the conventional parameterization. We prove that DAEDL enjoys a\nnumber of favorable theoretical properties. DAEDL demonstrates state-of-the-art\nperformance across diverse downstream tasks related to uncertainty estimation\nand classification.\n","authors":["Taeseong Yoon","Heeyoung Kim"],"pdf_url":"https://arxiv.org/pdf/2409.08754v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2409.08752v1","updated":"2024-09-13T12:03:23Z","published":"2024-09-13T12:03:23Z","title":"A Hybrid Meta-Learning and Multi-Armed Bandit Approach for\n Context-Specific Multi-Objective Recommendation Optimization","summary":" Recommender systems in online marketplaces face the challenge of balancing\nmultiple objectives to satisfy various stakeholders, including customers,\nproviders, and the platform itself. This paper introduces Juggler-MAB, a hybrid\napproach that combines meta-learning with Multi-Armed Bandits (MAB) to address\nthe limitations of existing multi-stakeholder recommendation systems. Our\nmethod extends the Juggler framework, which uses meta-learning to predict\noptimal weights for utility and compensation adjustments, by incorporating a\nMAB component for real-time, context-specific refinements. We present a\ntwo-stage approach where Juggler provides initial weight predictions, followed\nby MAB-based adjustments that adapt to rapid changes in user behavior and\nmarket conditions. Our system leverages contextual features such as device type\nand brand to make fine-grained weight adjustments based on specific segments.\nTo evaluate our approach, we developed a simulation framework using a dataset\nof 0.6 million searches from Expedia's lodging booking platform. 
Results show\nthat Juggler-MAB outperforms the original Juggler model across all metrics,\nwith NDCG improvements of 2.9%, a 13.7% reduction in regret, and a 9.8%\nimprovement in best arm selection rate.\n","authors":["Tiago Cunha","Andrea Marchini"],"pdf_url":"https://arxiv.org/pdf/2409.08752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07272v2","updated":"2024-09-13T12:03:06Z","published":"2024-09-11T13:46:52Z","title":"RePlay: a Recommendation Framework for Experimentation and Production\n Use","summary":" Using a single tool to build and compare recommender systems significantly\nreduces the time to market for new models. In addition, the comparison results\nwhen using such tools look more consistent. This is why many different tools\nand libraries for researchers in the field of recommendations have recently\nappeared. Unfortunately, most of these frameworks are aimed primarily at\nresearchers and require modification for use in production due to the inability\nto work on large datasets or an inappropriate architecture. In this demo, we\npresent our open-source toolkit RePlay - a framework containing an end-to-end\npipeline for building recommender systems, which is ready for production use.\nRePlay also allows you to use a suitable stack for the pipeline on each stage:\nPandas, Polars, or Spark. This allows the library to scale computations and\ndeploy to a cluster. Thus, RePlay allows data scientists to easily move from\nresearch mode to production mode using the same interfaces.\n","authors":["Alexey Vasilev","Anna Volodkevich","Denis Kulandin","Tatiana Bysheva","Anton Klenitskiy"],"pdf_url":"https://arxiv.org/pdf/2409.07272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08744v1","updated":"2024-09-13T11:52:16Z","published":"2024-09-13T11:52:16Z","title":"Uncertainty and Generalizability in Foundation Models for Earth\n Observation","summary":" We take the perspective in which we want to design a downstream task (such as\nestimating vegetation coverage) on a certain area of interest (AOI) with a\nlimited labeling budget. By leveraging an existing Foundation Model (FM) we\nmust decide whether we train a downstream model on a different but label-rich\nAOI hoping it generalizes to our AOI, or we split labels in our AOI for\ntraining and validating. In either case, we face choices concerning what FM to\nuse, how to sample our AOI for labeling, etc. which affect both the performance\nand uncertainty of the results. In this work, we perform a large ablative study\nusing eight existing FMs on either Sentinel 1 or Sentinel 2 as input data, and\nthe classes from the ESA World Cover product as downstream tasks across eleven\nAOIs. We do repeated sampling and training, resulting in an ablation of some\n500K simple linear regression models. Our results show both the limits of\nspatial generalizability across AOIs and the power of FMs where we are able to\nget over 0.9 correlation coefficient between predictions and targets on\ndifferent chip level predictive tasks. And still, performance and uncertainty\nvary greatly across AOIs, tasks and FMs. We believe this is a key issue in\npractice, because there are many design decisions behind each FM and downstream\ntask (input modalities, sampling, architectures, pretraining, etc.) 
and usually\na downstream task designer is aware of and can decide upon a few of them.\nThrough this work, we advocate for the usage of the methodology herein\ndescribed (large ablations on reference global labels and simple probes), both\nwhen publishing new FMs, and to make informed decisions when designing\ndownstream tasks to use them.\n","authors":["Raul Ramos-Pollan","Freddie Kalaitzis","Karthick Panner Selvam"],"pdf_url":"https://arxiv.org/pdf/2409.08744v1.pdf","comment":"A large ablation study measuring uncertainty and spatial\n generalizability with 8 foundation models, 11 world regions and 7 downstream\n tasks"},{"id":"http://arxiv.org/abs/2409.08741v1","updated":"2024-09-13T11:50:09Z","published":"2024-09-13T11:50:09Z","title":"Adaptive Sampling for Continuous Group Equivariant Neural Networks","summary":" Steerable networks, which process data with intrinsic symmetries, often use\nFourier-based nonlinearities that require sampling from the entire group,\nleading to a need for discretization in continuous groups. As the number of\nsamples increases, both performance and equivariance improve, yet this also\nleads to higher computational costs. To address this, we introduce an adaptive\nsampling approach that dynamically adjusts the sampling process to the\nsymmetries in the data, reducing the number of required group samples and\nlowering the computational demands. We explore various implementations and\ntheir effects on model performance, equivariance, and computational efficiency.\nOur findings demonstrate improved model performance, and a marginal increase in\nmemory efficiency.\n","authors":["Berfin Inal","Gabriele Cesa"],"pdf_url":"https://arxiv.org/pdf/2409.08741v1.pdf","comment":"9 pages, published in the Geometry-grounded Representation Learning\n and Generative Modeling (GRaM) Workshop at ICML 2024"},{"id":"http://arxiv.org/abs/2409.08733v1","updated":"2024-09-13T11:34:28Z","published":"2024-09-13T11:34:28Z","title":"Multi-intent Aware Contrastive Learning for Sequential Recommendation","summary":" Intent is a significant latent factor influencing user-item interaction\nsequences. Prevalent sequence recommendation models that utilize contrastive\nlearning predominantly rely on single-intent representations to direct the\ntraining process. However, this paradigm oversimplifies real-world\nrecommendation scenarios, attempting to encapsulate the diversity of intents\nwithin the single-intent level representation. SR models considering\nmulti-intent information in their framework are more likely to reflect\nreal-life recommendation scenarios accurately.\n","authors":["Junshu Huang","Zi Long","Xianghua Fu","Yin Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08732v1","updated":"2024-09-13T11:33:57Z","published":"2024-09-13T11:33:57Z","title":"Bridging Dynamic Factor Models and Neural Controlled Differential\n Equations for Nowcasting GDP","summary":" Gross domestic product (GDP) nowcasting is crucial for policy-making as GDP\ngrowth is a key indicator of economic conditions. Dynamic factor models (DFMs)\nhave been widely adopted by government agencies for GDP nowcasting due to their\nability to handle irregular or missing macroeconomic indicators and their\ninterpretability. However, DFMs face two main challenges: i) the lack of\ncapturing economic uncertainties such as sudden recessions or booms, and ii)\nthe limitation of capturing irregular dynamics from mixed-frequency data. 
To\naddress these challenges, we introduce NCDENow, a novel GDP nowcasting\nframework that integrates neural controlled differential equations (NCDEs) with\nDFMs. This integration effectively handles the dynamics of irregular time\nseries. NCDENow consists of 3 main modules: i) factor extraction leveraging\nDFM, ii) dynamic modeling using NCDE, and iii) GDP growth prediction through\nregression. We evaluate NCDENow against 6 baselines on 2 real-world GDP\ndatasets from South Korea and the United Kingdom, demonstrating its enhanced\npredictive capability. Our empirical results favor our method, highlighting the\nsignificant potential of integrating NCDE into nowcasting models. Our code and\ndataset are available at https://github.com/sklim84/NCDENow_CIKM2024.\n","authors":["Seonkyu Lim","Jeongwhan Choi","Noseong Park","Sang-Ha Yoon","ShinHyuck Kang","Young-Min Kim","Hyunjoong Kang"],"pdf_url":"https://arxiv.org/pdf/2409.08732v1.pdf","comment":"Accepted at CIKM 2024. Seonkyu Lim and Jeongwhan Choi are co-first\n authors with equal contributions"},{"id":"http://arxiv.org/abs/2409.08728v1","updated":"2024-09-13T11:30:42Z","published":"2024-09-13T11:30:42Z","title":"Disentangling the sources of cyber risk premia","summary":" We use a methodology based on a machine learning algorithm to quantify firms'\ncyber risks based on their disclosures and a dedicated cyber corpus. The model\ncan identify paragraphs related to determined cyber-threat types and\naccordingly attribute several related cyber scores to the firm. The cyber\nscores are unrelated to other firms' characteristics. Stocks with high cyber\nscores significantly outperform other stocks. The long-short cyber risk factors\nhave positive risk premia, are robust to all factors' benchmarks, and help\nprice returns. Furthermore, we suggest the market does not distinguish between\ndifferent types of cyber risks but instead views them as a single, aggregate\ncyber risk.\n","authors":["Loïc Maréchal","Nathan Monnet"],"pdf_url":"https://arxiv.org/pdf/2409.08728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08724v1","updated":"2024-09-13T11:26:05Z","published":"2024-09-13T11:26:05Z","title":"Quasimetric Value Functions with Dense Rewards","summary":" As a generalization of reinforcement learning (RL) to parametrizable goals,\ngoal conditioned RL (GCRL) has a broad range of applications, particularly in\nchallenging tasks in robotics. Recent work has established that the optimal\nvalue function of GCRL $Q^\\ast(s,a,g)$ has a quasimetric structure, leading to\ntargetted neural architectures that respect such structure. However, the\nrelevant analyses assume a sparse reward setting -- a known aggravating factor\nto sample complexity. We show that the key property underpinning a quasimetric,\nviz., the triangle inequality, is preserved under a dense reward setting as\nwell. Contrary to earlier findings where dense rewards were shown to be\ndetrimental to GCRL, we identify the key condition necessary for triangle\ninequality. Dense reward functions that satisfy this condition can only\nimprove, never worsen, sample complexity. This opens up opportunities to train\nefficient neural architectures with dense rewards, compounding their benefits\nto sample complexity. We evaluate this proposal in 12 standard benchmark\nenvironments in GCRL featuring challenging continuous control tasks. 
Our\nempirical results confirm that training a quasimetric value function in our\ndense reward setting indeed outperforms training with sparse rewards.\n","authors":["Khadichabonu Valieva","Bikramjit Banerjee"],"pdf_url":"https://arxiv.org/pdf/2409.08724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08712v1","updated":"2024-09-13T10:59:24Z","published":"2024-09-13T10:59:24Z","title":"Layerwise Change of Knowledge in Neural Networks","summary":" This paper aims to explain how a deep neural network (DNN) gradually extracts\nnew knowledge and forgets noisy features through layers in forward propagation.\nUp to now, although the definition of knowledge encoded by the DNN has not\nreached a consensus, Previous studies have derived a series of mathematical\nevidence to take interactions as symbolic primitive inference patterns encoded\nby a DNN. We extend the definition of interactions and, for the first time,\nextract interactions encoded by intermediate layers. We quantify and track the\nnewly emerged interactions and the forgotten interactions in each layer during\nthe forward propagation, which shed new light on the learning behavior of DNNs.\nThe layer-wise change of interactions also reveals the change of the\ngeneralization capacity and instability of feature representations of a DNN.\n","authors":["Xu Cheng","Lei Cheng","Zhaoran Peng","Yang Xu","Tian Han","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.08712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05629v2","updated":"2024-09-13T10:49:21Z","published":"2024-08-10T20:48:40Z","title":"Quantum-secure multiparty deep learning","summary":" Secure multiparty computation enables the joint evaluation of multivariate\nfunctions across distributed users while ensuring the privacy of their local\ninputs. This field has become increasingly urgent due to the exploding demand\nfor computationally intensive deep learning inference. These computations are\ntypically offloaded to cloud computing servers, leading to vulnerabilities that\ncan compromise the security of the clients' data. To solve this problem, we\nintroduce a linear algebra engine that leverages the quantum nature of light\nfor information-theoretically secure multiparty computation using only\nconventional telecommunication components. We apply this linear algebra engine\nto deep learning and derive rigorous upper bounds on the information leakage of\nboth the deep neural network weights and the client's data via the Holevo and\nthe Cram\\'er-Rao bounds, respectively. Applied to the MNIST classification\ntask, we obtain test accuracies exceeding $96\\%$ while leaking less than $0.1$\nbits per weight symbol and $0.01$ bits per data symbol. This weight leakage is\nan order of magnitude below the minimum bit precision required for accurate\ndeep learning using state-of-the-art quantization techniques. Our work lays the\nfoundation for practical quantum-secure computation and unlocks secure cloud\ndeep learning as a field.\n","authors":["Kfir Sulimany","Sri Krishna Vadlamani","Ryan Hamerly","Prahlad Iyengar","Dirk Englund"],"pdf_url":"https://arxiv.org/pdf/2408.05629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08706v1","updated":"2024-09-13T10:48:35Z","published":"2024-09-13T10:48:35Z","title":"L3Cube-IndicQuest: A Benchmark Questing Answering Dataset for Evaluating\n Knowledge of LLMs in Indic Context","summary":" Large Language Models (LLMs) have made significant progress in incorporating\nIndic languages within multilingual models. 
However, it is crucial to\nquantitatively assess whether these languages perform comparably to globally\ndominant ones, such as English. Currently, there is a lack of benchmark\ndatasets specifically designed to evaluate the regional knowledge of LLMs in\nvarious Indic languages. In this paper, we present the L3Cube-IndicQuest, a\ngold-standard question-answering benchmark dataset designed to evaluate how\nwell multilingual LLMs capture regional knowledge across various Indic\nlanguages. The dataset contains 200 question-answer pairs, each for English and\n19 Indic languages, covering five domains specific to the Indic region. We aim\nfor this dataset to serve as a benchmark, providing ground truth for evaluating\nthe performance of LLMs in understanding and representing knowledge relevant to\nthe Indian context. The IndicQuest can be used for both reference-based\nevaluation and LLM-as-a-judge evaluation. The dataset is shared publicly at\nhttps://github.com/l3cube-pune/indic-nlp .\n","authors":["Pritika Rohera","Chaitrali Ginimav","Akanksha Salunke","Gayatri Sawant","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.08706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15589v2","updated":"2024-09-13T10:47:25Z","published":"2024-07-22T12:26:08Z","title":"Exploring the Effectiveness of Object-Centric Representations in Visual\n Question Answering: Comparative Insights with Foundation Models","summary":" Object-centric (OC) representations, which represent the state of a visual\nscene by modeling it as a composition of objects, have the potential to be used\nin various downstream tasks to achieve systematic compositional generalization\nand facilitate reasoning. However, these claims have not been thoroughly\nanalyzed yet. Recently, foundation models have demonstrated unparalleled\ncapabilities across diverse domains from language to computer vision, marking\nthem as a potential cornerstone of future research for a multitude of\ncomputational tasks. In this paper, we conduct an extensive empirical study on\nrepresentation learning for downstream Visual Question Answering (VQA), which\nrequires an accurate compositional understanding of the scene. We thoroughly\ninvestigate the benefits and trade-offs of OC models and alternative approaches\nincluding large pre-trained foundation models on both synthetic and real-world\ndata, and demonstrate a viable way to achieve the best of both worlds. The\nextensiveness of our study, encompassing over 800 downstream VQA models and 15\ndifferent types of upstream representations, also provides several additional\ninsights that we believe will be of interest to the community at large.\n","authors":["Amir Mohammad Karimi Mamaghan","Samuele Papa","Karl Henrik Johansson","Stefan Bauer","Andrea Dittadi"],"pdf_url":"https://arxiv.org/pdf/2407.15589v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03493v3","updated":"2024-09-13T10:42:00Z","published":"2024-04-04T14:48:26Z","title":"A Methodology to Study the Impact of Spiking Neural Network Parameters\n considering Event-Based Automotive Data","summary":" Autonomous Driving (AD) systems are considered as the future of human\nmobility and transportation. Solving computer vision tasks such as image\nclassification and object detection/segmentation, with high accuracy and low\npower/energy consumption, is highly needed to realize AD systems in real life.\nThese requirements can potentially be satisfied by Spiking Neural Networks\n(SNNs). 
However, the state-of-the-art works in SNN-based AD systems still focus\non proposing network models that can achieve high accuracy, and they have not\nsystematically studied the roles of SNN parameters when used for learning\nevent-based automotive data. Therefore, we still lack understanding of how to\neffectively develop SNN models for AD systems. Toward this, we propose a novel\nmethodology to systematically study and analyze the impact of SNN parameters\nconsidering event-based automotive data, then leverage this analysis for\nenhancing SNN development. To do this, we first explore different settings of\nSNN parameters that directly affect the learning mechanism (i.e., batch size,\nlearning rate, neuron threshold potential, and weight decay), then analyze the\naccuracy results. Afterward, we propose techniques that jointly improve SNN\naccuracy and reduce training time. Experimental results show that our\nmethodology can improve the SNN models for AD systems beyond the\nstate-of-the-art, as it achieves higher accuracy (i.e., 86%) for the NCARS\ndataset, and it can also achieve iso-accuracy (i.e., ~85% with standard\ndeviation less than 0.5%) while speeding up the training time by 1.9x. In this\nmanner, our research work provides a set of guidelines for SNN parameter\nenhancements, thereby enabling the practical development of SNN-based AD\nsystems.\n","authors":["Iqra Bano","Rachmad Vidya Wicaksana Putra","Alberto Marchisio","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2404.03493v3.pdf","comment":"To appear at the 18th International Conference on Control,\n Automation, Robotics and Vision (ICARCV), December 2024, Dubai, UAE"},{"id":"http://arxiv.org/abs/2409.08700v1","updated":"2024-09-13T10:39:36Z","published":"2024-09-13T10:39:36Z","title":"Personalized Weight Loss Management through Wearable Devices and\n Artificial Intelligence","summary":" Early detection of chronic and Non-Communicable Diseases (NCDs) is crucial\nfor effective treatment during the initial stages. This study explores the\napplication of wearable devices and Artificial Intelligence (AI) in order to\npredict weight loss changes in overweight and obese individuals. Using wearable\ndata from a 1-month trial involving around 100 subjects from the AI4FoodDB\ndatabase, including biomarkers, vital signs, and behavioral data, we identify\nkey differences between those achieving weight loss (>= 2% of their initial\nweight) and those who do not. Feature selection techniques and classification\nalgorithms reveal promising results, with the Gradient Boosting classifier\nachieving 84.44% Area Under the Curve (AUC). The integration of multiple data\nsources (e.g., vital signs, physical and sleep activity, etc.) enhances\nperformance, suggesting the potential of wearable devices and AI in\npersonalized healthcare.\n","authors":["Sergio Romero-Tapiador","Ruben Tolosana","Aythami Morales","Blanca Lacruz-Pleguezuelos","Sofia Bosch Pastor","Laura Judith Marcos-Zambrano","Guadalupe X. 
Bazán","Gala Freixer","Ruben Vera-Rodriguez","Julian Fierrez","Javier Ortega-Garcia","Isabel Espinosa-Salinas","Enrique Carrillo de Santa Pau"],"pdf_url":"https://arxiv.org/pdf/2409.08700v1.pdf","comment":"15 pages, 5 figures, 6 tables, 1 appendix"},{"id":"http://arxiv.org/abs/2409.08695v1","updated":"2024-09-13T10:27:27Z","published":"2024-09-13T10:27:27Z","title":"Precision Aquaculture: An Integrated Computer Vision and IoT Approach\n for Optimized Tilapia Feeding","summary":" Traditional fish farming practices often lead to inefficient feeding,\nresulting in environmental issues and reduced productivity. We developed an\ninnovative system combining computer vision and IoT technologies for precise\nTilapia feeding. Our solution uses real-time IoT sensors to monitor water\nquality parameters and computer vision algorithms to analyze fish size and\ncount, determining optimal feed amounts. A mobile app enables remote monitoring\nand control. We utilized YOLOv8 for keypoint detection to measure Tilapia\nweight from length, achieving \\textbf{94\\%} precision on 3,500 annotated\nimages. Pixel-based measurements were converted to centimeters using depth\nestimation for accurate feeding calculations. Our method, with data collection\nmirroring inference conditions, significantly improved results. Preliminary\nestimates suggest this approach could increase production up to 58 times\ncompared to traditional farms. Our models, code, and dataset are\nopen-source~\\footnote{The code, dataset, and models are available upon\nreasonable request.\n","authors":["Rania Hossam","Ahmed Heakl","Walid Gomaa"],"pdf_url":"https://arxiv.org/pdf/2409.08695v1.pdf","comment":"8 pages, 6 figures, 3 tables, 21th International Conference on\n Informatics in Control, Automation, and Robotics"},{"id":"http://arxiv.org/abs/2409.08687v1","updated":"2024-09-13T10:07:28Z","published":"2024-09-13T10:07:28Z","title":"xTED: Cross-Domain Policy Adaptation via Diffusion-Based Trajectory\n Editing","summary":" Reusing pre-collected data from different domains is an attractive solution\nin decision-making tasks where the accessible data is insufficient in the\ntarget domain but relatively abundant in other related domains. Existing\ncross-domain policy transfer methods mostly aim at learning domain\ncorrespondences or corrections to facilitate policy learning, which requires\nlearning domain/task-specific model components, representations, or policies\nthat are inflexible or not fully reusable to accommodate arbitrary domains and\ntasks. These issues make us wonder: can we directly bridge the domain gap at\nthe data (trajectory) level, instead of devising complicated, domain-specific\npolicy transfer models? In this study, we propose a Cross-Domain Trajectory\nEDiting (xTED) framework with a new diffusion transformer model (Decision\nDiffusion Transformer, DDiT) that captures the trajectory distribution from the\ntarget dataset as a prior. The proposed diffusion transformer backbone captures\nthe intricate dependencies among state, action, and reward sequences, as well\nas the transition dynamics within the target data trajectories. 
With the above\npre-trained diffusion prior, source data trajectories with domain gaps can be\ntransformed into edited trajectories that closely resemble the target data\ndistribution through the diffusion-based editing process, which implicitly\ncorrects the underlying domain gaps, enhancing the state realism and dynamics\nreliability in source trajectory data, while enabling flexible choices of\ndownstream policy learning methods. Despite its simplicity, xTED demonstrates\nsuperior performance against other baselines in extensive simulation and\nreal-robot experiments.\n","authors":["Haoyi Niu","Qimao Chen","Tenglong Liu","Jianxiong Li","Guyue Zhou","Yi Zhang","Jianming Hu","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2409.08687v1.pdf","comment":"xTED offers a novel, generic, flexible, simple and effective paradigm\n that casts cross-domain policy adaptation as a data pre-processing problem"},{"id":"http://arxiv.org/abs/2402.14482v3","updated":"2024-09-13T09:54:46Z","published":"2024-02-22T12:15:05Z","title":"SpanSeq: Similarity-based sequence data splitting method for improved\n development and assessment of deep learning projects","summary":" The use of deep learning models in computational biology has increased\nmassively in recent years, and it is expected to continue with the current\nadvances in the fields such as Natural Language Processing. These models,\nalthough able to draw complex relations between input and target, are also\ninclined to learn noisy deviations from the pool of data used during their\ndevelopment. In order to assess their performance on unseen data (their\ncapacity to generalize), it is common to split the available data randomly into\ndevelopment (train/validation) and test sets. This procedure, although\nstandard, has been shown to produce dubious assessments of generalization due\nto the existing similarity between samples in the databases used. In this work,\nwe present SpanSeq, a database partition method for machine learning that can\nscale to most biological sequences (genes, proteins and genomes) in order to\navoid data leakage between sets. We also explore the effect of not restraining\nsimilarity between sets by reproducing the development of two state-of-the-art\nmodels on bioinformatics, not only confirming the consequences of randomly\nsplitting databases on the model assessment, but expanding those repercussions\nto the model development. SpanSeq is available at\nhttps://github.com/genomicepidemiology/SpanSeq.\n","authors":["Alfred Ferrer Florensa","Jose Juan Almagro Armenteros","Henrik Nielsen","Frank Møller Aarestrup","Philip Thomas Lanken Conradsen Clausen"],"pdf_url":"https://arxiv.org/pdf/2402.14482v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08676v1","updated":"2024-09-13T09:43:36Z","published":"2024-09-13T09:43:36Z","title":"Redesigning graph filter-based GNNs to relax the homophily assumption","summary":" Graph neural networks (GNNs) have become a workhorse approach for learning\nfrom data defined over irregular domains, typically by implicitly assuming that\nthe data structure is represented by a homophilic graph. However, recent works\nhave revealed that many relevant applications involve heterophilic data where\nthe performance of GNNs can be notably compromised. To address this challenge,\nwe present a simple yet effective architecture designed to mitigate the\nlimitations of the homophily assumption. 
The proposed architecture reinterprets\nthe role of graph filters in convolutional GNNs, resulting in a more general\narchitecture while incorporating a stronger inductive bias than GNNs based on\nfilter banks. The proposed convolutional layer enhances the expressive capacity\nof the architecture, enabling it to learn from both homophilic and heterophilic\ndata and preventing the issue of oversmoothing. From a theoretical standpoint,\nwe show that the proposed architecture is permutation equivariant. Finally, we\nshow that the proposed GNN compares favorably with several\nstate-of-the-art baselines on both homophilic and heterophilic datasets,\nshowcasing its promising potential.\n","authors":["Samuel Rey","Madeline Navarro","Victor M. Tenorio","Santiago Segarra","Antonio G. Marques"],"pdf_url":"https://arxiv.org/pdf/2409.08676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05611v2","updated":"2024-09-13T09:40:02Z","published":"2024-05-09T08:15:31Z","title":"Privacy-Preserving Edge Federated Learning for Intelligent Mobile-Health\n Systems","summary":" Machine Learning (ML) algorithms are generally designed for scenarios in\nwhich all data is stored in one data center, where the training is performed.\nHowever, in many applications, e.g., in the healthcare domain, the training\ndata is distributed among several entities, e.g., different hospitals or\npatients' mobile devices/sensors. At the same time, transferring the data to a\ncentral location for learning is certainly not an option, due to privacy\nconcerns and legal issues, and in certain cases, because of the communication\nand computation overheads. Federated Learning (FL) is the state-of-the-art\ncollaborative ML approach for training an ML model across multiple parties\nholding local data samples, without sharing them. However, enabling learning\nfrom distributed data over such edge Internet of Things (IoT) systems (e.g.,\nmobile-health and wearable technologies, involving sensitive personal/medical\ndata) in a privacy-preserving fashion presents a major challenge mainly due to\ntheir stringent resource constraints, i.e., limited computing capacity,\ncommunication bandwidth, memory storage, and battery lifetime. In this paper,\nwe propose a privacy-preserving edge FL framework for resource-constrained\nmobile-health and wearable technologies over the IoT infrastructure. We\nevaluate our proposed framework extensively and provide the implementation of\nour technique on Amazon's AWS cloud platform based on the seizure detection\napplication in epilepsy monitoring using wearable technologies.\n","authors":["Amin Aminifar","Matin Shokri","Amir Aminifar"],"pdf_url":"https://arxiv.org/pdf/2405.05611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01829v2","updated":"2024-09-13T09:39:53Z","published":"2024-06-03T22:56:40Z","title":"FaçAID: A Transformer Model for Neuro-Symbolic Facade Reconstruction","summary":" We introduce a neuro-symbolic transformer-based model that converts flat,\nsegmented facade structures into procedural definitions using a custom-designed\nsplit grammar. To facilitate this, we first develop a semi-complex split\ngrammar tailored for architectural facades and then generate a dataset\ncomprising facades alongside their corresponding procedural representations.\nThis dataset is used to train our transformer model to convert segmented, flat\nfacades into the procedural language of our grammar. 
During inference, the\nmodel applies this learned transformation to new facade segmentations,\nproviding a procedural representation that users can adjust to generate varied\nfacade designs. This method not only automates the conversion of static facade\nimages into dynamic, editable procedural formats but also enhances the design\nflexibility, allowing for easy modifications.\n","authors":["Aleksander Plocharski","Jan Swidzinski","Joanna Porter-Sobieraj","Przemyslaw Musialski"],"pdf_url":"https://arxiv.org/pdf/2406.01829v2.pdf","comment":"11 pages, 11 figures, in ACM SIGGRAPH Asia 2024 Conference Papers\n Proceedings"},{"id":"http://arxiv.org/abs/2409.08673v1","updated":"2024-09-13T09:37:44Z","published":"2024-09-13T09:37:44Z","title":"Acoustic identification of individual animals with hierarchical\n contrastive learning","summary":" Acoustic identification of individual animals (AIID) is closely related to\naudio-based species classification but requires a finer level of detail to\ndistinguish between individual animals within the same species. In this work,\nwe frame AIID as a hierarchical multi-label classification task and propose the\nuse of hierarchy-aware loss functions to learn robust representations of\nindividual identities that maintain the hierarchical relationships among\nspecies and taxa. Our results demonstrate that hierarchical embeddings not only\nenhance identification accuracy at the individual level but also at higher\ntaxonomic levels, effectively preserving the hierarchical structure in the\nlearned representations. By comparing our approach with non-hierarchical\nmodels, we highlight the advantage of enforcing this structure in the embedding\nspace. Additionally, we extend the evaluation to the classification of novel\nindividual classes, demonstrating the potential of our method in open-set\nclassification scenarios.\n","authors":["Ines Nolasco","Ilyass Moummad","Dan Stowell","Emmanouil Benetos"],"pdf_url":"https://arxiv.org/pdf/2409.08673v1.pdf","comment":"Under review; Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2311.15649v3","updated":"2024-09-13T09:36:18Z","published":"2023-11-27T09:20:23Z","title":"RoboGPT: an intelligent agent of making embodied long-term decisions for\n daily instruction tasks","summary":" Robotic agents must master common sense and long-term sequential decisions to\nsolve daily tasks through natural language instruction. The developments in\nLarge Language Models (LLMs) in natural language processing have inspired\nefforts to use LLMs in complex robot planning. Despite LLMs' great\ngeneralization and comprehension of instruction tasks, LLMs-generated task\nplans sometimes lack feasibility and correctness. To address the problem, we\npropose a RoboGPT agent\\footnote{our code and dataset will be released soon}\nfor making embodied long-term decisions for daily tasks, with two modules: 1)\nLLMs-based planning with re-plan to break the task into multiple sub-goals; 2)\nRoboSkill individually designed for sub-goals to learn better navigation and\nmanipulation skills. The LLMs-based planning is enhanced with a new robotic\ndataset and re-plan, called RoboGPT. The new robotic dataset of 67k daily\ninstruction tasks is gathered for fine-tuning the Llama model and obtaining\nRoboGPT. RoboGPT planner with strong generalization can plan hundreds of daily\ninstruction tasks. 
Additionally, a low-computational Re-Plan module is designed\nto allow plans to flexibly adapt to the environment, thereby addressing the\nnomenclature diversity challenge. The proposed RoboGPT agent outperforms SOTA\nmethods on the ALFRED daily tasks. Moreover, RoboGPT planner exceeds SOTA\nLLM-based planners like ChatGPT in task-planning rationality for hundreds of\nunseen daily tasks, and even other domain tasks, while keeping the large\nmodel's original broad application and generality.\n","authors":["Yaran Chen","Wenbo Cui","Yuanwen Chen","Mining Tan","Xinyao Zhang","Dongbin Zhao","He Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03708v2","updated":"2024-09-13T09:33:59Z","published":"2024-04-04T11:22:58Z","title":"Dendrites endow artificial neural networks with accurate, robust and\n parameter-efficient learning","summary":" Artificial neural networks (ANNs) are at the core of most Deep learning (DL)\nalgorithms that successfully tackle complex problems like image recognition,\nautonomous driving, and natural language processing. However, unlike biological\nbrains who tackle similar problems in a very efficient manner, DL algorithms\nrequire a large number of trainable parameters, making them energy-intensive\nand prone to overfitting. Here, we show that a new ANN architecture that\nincorporates the structured connectivity and restricted sampling properties of\nbiological dendrites counteracts these limitations. We find that dendritic ANNs\nare more robust to overfitting and outperform traditional ANNs on several image\nclassification tasks while using significantly fewer trainable parameters.\nThese advantages are likely the result of a different learning strategy,\nwhereby most of the nodes in dendritic ANNs respond to multiple classes, unlike\nclassical ANNs that strive for class-specificity. Our findings suggest that the\nincorporation of dendritic properties can make learning in ANNs more precise,\nresilient, and parameter-efficient and shed new light on how biological\nfeatures can impact the learning strategies of ANNs.\n","authors":["Spyridon Chavlis","Panayiota Poirazi"],"pdf_url":"https://arxiv.org/pdf/2404.03708v2.pdf","comment":"69 pages, 6 main and 11 supplementary figures, 2 main and 3\n supplementary tables"},{"id":"http://arxiv.org/abs/2407.13070v2","updated":"2024-09-13T09:33:20Z","published":"2024-05-28T21:54:03Z","title":"The Cost of Arbitrariness for Individuals: Examining the Legal and\n Technical Challenges of Model Multiplicity","summary":" Model multiplicity, the phenomenon where multiple models achieve similar\nperformance despite different underlying learned functions, introduces\narbitrariness in model selection. While this arbitrariness may seem\ninconsequential in expectation, its impact on individuals can be severe. This\npaper explores various individual concerns stemming from multiplicity,\nincluding the effects of arbitrariness beyond final predictions, disparate\narbitrariness for individuals belonging to protected groups, and the challenges\nassociated with the arbitrariness of a single algorithmic system creating a\nmonopoly across various contexts. It provides both an empirical examination of\nthese concerns and a comprehensive analysis from the legal standpoint,\naddressing how these issues are perceived in the anti-discrimination law in\nCanada. 
We conclude the discussion with technical challenges in the current\nlandscape of model multiplicity to meet legal requirements and the legal gap\nbetween current law and the implications of arbitrariness in model selection,\nhighlighting relevant future research directions for both disciplines.\n","authors":["Prakhar Ganesh","Ihsan Ibrahim Daldaban","Ignacio Cofone","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2407.13070v2.pdf","comment":"Current version of the paper contains errors in the attribution of\n previous work. We are working on creating a new version, which can take a\n while and thus are withdrawing this version in the meantime"},{"id":"http://arxiv.org/abs/2409.08666v1","updated":"2024-09-13T09:27:59Z","published":"2024-09-13T09:27:59Z","title":"Towards certifiable AI in aviation: landscape, challenges, and\n opportunities","summary":" Artificial Intelligence (AI) methods are powerful tools for various domains,\nincluding critical fields such as avionics, where certification is required to\nachieve and maintain an acceptable level of safety. General solutions for\nsafety-critical systems must address three main questions: Is it suitable? What\ndrives the system's decisions? Is it robust to errors/attacks? This is more\ncomplex in AI than in traditional methods. In this context, this paper presents\na comprehensive mind map of formal AI certification in avionics. It highlights\nthe challenges of certifying AI development with an example to emphasize the\nneed for qualification beyond performance metrics.\n","authors":["Hymalai Bello","Daniel Geißler","Lala Ray","Stefan Müller-Divéky","Peter Müller","Shannon Kittrell","Mengxi Liu","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2409.08666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08664v1","updated":"2024-09-13T09:27:05Z","published":"2024-09-13T09:27:05Z","title":"Investigating Disentanglement in a Phoneme-level Speech Codec for\n Prosody Modeling","summary":" Most of the prevalent approaches in speech prosody modeling rely on learning\nglobal style representations in a continuous latent space which encode and\ntransfer the attributes of reference speech. However, recent work on neural\ncodecs which are based on Residual Vector Quantization (RVQ) already shows\ngreat potential offering distinct advantages. We investigate the prosody\nmodeling capabilities of the discrete space of such an RVQ-VAE model, modifying\nit to operate on the phoneme-level. We condition both the encoder and decoder\nof the model on linguistic representations and apply a global speaker embedding\nin order to factor out both phonetic and speaker information. We conduct an\nextensive set of investigations based on subjective experiments and objective\nmeasures to show that the phoneme-level discrete latent representations\nobtained this way achieves a high degree of disentanglement, capturing\nfine-grained prosodic information that is robust and transferable. 
The latent\nspace turns out to have interpretable structure with its principal components\ncorresponding to pitch and energy.\n","authors":["Sotirios Karapiperis","Nikolaos Ellinas","Alexandra Vioni","Junkwang Oh","Gunu Jho","Inchul Hwang","Spyros Raptis"],"pdf_url":"https://arxiv.org/pdf/2409.08664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01939v2","updated":"2024-09-13T09:22:38Z","published":"2023-05-03T07:32:28Z","title":"Where We Have Arrived in Proving the Emergence of Sparse Symbolic\n Concepts in AI Models","summary":" This study aims to prove the emergence of symbolic concepts (or more\nprecisely, sparse primitive inference patterns) in well-trained deep neural\nnetworks (DNNs). Specifically, we prove the following three conditions for the\nemergence. (i) The high-order derivatives of the network output with respect to\nthe input variables are all zero. (ii) The DNN can be used on occluded samples\nand when the input sample is less occluded, the DNN will yield higher\nconfidence. (iii) The confidence of the DNN does not significantly degrade on\noccluded samples. These conditions are quite common, and we prove that under\nthese conditions, the DNN will only encode a relatively small number of sparse\ninteractions between input variables. Moreover, we can consider such\ninteractions as symbolic primitive inference patterns encoded by a DNN, because\nwe show that inference scores of the DNN on an exponentially large number of\nrandomly masked samples can always be well mimicked by numerical effects of\njust a few interactions.\n","authors":["Qihan Ren","Jiayang Gao","Wen Shen","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.01939v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08660v1","updated":"2024-09-13T09:20:42Z","published":"2024-09-13T09:20:42Z","title":"Online Learning Of Expanding Graphs","summary":" This paper addresses the problem of online network topology inference for\nexpanding graphs from a stream of spatiotemporal signals. Online algorithms for\ndynamic graph learning are crucial in delay-sensitive applications or when\nchanges in topology occur rapidly. While existing works focus on inferring the\nconnectivity within a fixed set of nodes, in practice, the graph can grow as\nnew nodes join the network. This poses additional challenges like modeling\ntemporal dynamics involving signals and graphs of different sizes. This growth\nalso increases the computational complexity of the learning process, which may\nbecome prohibitive. To the best of our knowledge, this is the first work to\ntackle this setting. We propose a general online algorithm based on projected\nproximal gradient descent that accounts for the increasing graph size at each\niteration. Recursively updating the sample covariance matrix is a key aspect of\nour approach. We introduce a strategy that enables different types of updates\nfor nodes that just joined the network and for previously existing nodes. To\nprovide further insights into the proposed method, we specialize it in Gaussian\nMarkov random field settings, where we analyze the computational complexity and\ncharacterize the dynamic cumulative regret. 
Finally, we demonstrate the\neffectiveness of the proposed approach using both controlled experiments and\nreal-world datasets from epidemic and financial networks.\n","authors":["Samuel Rey","Bishwadeep Das","Elvin Isufi"],"pdf_url":"https://arxiv.org/pdf/2409.08660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13091v2","updated":"2024-09-13T09:19:14Z","published":"2023-02-25T14:44:40Z","title":"Explaining Generalization Power of a DNN Using Interactive Concepts","summary":" This paper explains the generalization power of a deep neural network (DNN)\nfrom the perspective of interactions. Although there is no universally accepted\ndefinition of the concepts encoded by a DNN, the sparsity of interactions in a\nDNN has been proved, i.e., the output score of a DNN can be well explained by a\nsmall number of interactions between input variables. In this way, to some\nextent, we can consider such interactions as interactive concepts encoded by\nthe DNN. Therefore, in this paper, we derive an analytic explanation of\ninconsistency of concepts of different complexities. This may shed new lights\non using the generalization power of concepts to explain the generalization\npower of the entire DNN. Besides, we discover that the DNN with stronger\ngeneralization power usually learns simple concepts more quickly and encodes\nfewer complex concepts. We also discover the detouring dynamics of learning\ncomplex concepts, which explains both the high learning difficulty and the low\ngeneralization power of complex concepts. The code will be released when the\npaper is accepted.\n","authors":["Huilin Zhou","Hao Zhang","Huiqi Deng","Dongrui Liu","Wen Shen","Shih-Han Chan","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08658v1","updated":"2024-09-13T09:18:29Z","published":"2024-09-13T09:18:29Z","title":"Promoting Fairness in Link Prediction with Graph Enhancement","summary":" Link prediction is a crucial task in network analysis, but it has been shown\nto be prone to biased predictions, particularly when links are unfairly\npredicted between nodes from different sensitive groups. In this paper, we\nstudy the fair link prediction problem, which aims to ensure that the predicted\nlink probability is independent of the sensitive attributes of the connected\nnodes. Existing methods typically incorporate debiasing techniques within graph\nembeddings to mitigate this issue. However, training on large real-world graphs\nis already challenging, and adding fairness constraints can further complicate\nthe process. To overcome this challenge, we propose FairLink, a method that\nlearns a fairness-enhanced graph to bypass the need for debiasing during the\nlink predictor's training. FairLink maintains link prediction accuracy by\nensuring that the enhanced graph follows a training trajectory similar to that\nof the original input graph. Meanwhile, it enhances fairness by minimizing the\nabsolute difference in link probabilities between node pairs within the same\nsensitive group and those between node pairs from different sensitive groups.\nOur extensive experiments on multiple large-scale graphs demonstrate that\nFairLink not only promotes fairness but also often achieves link prediction\naccuracy comparable to baseline methods. 
Most importantly, the enhanced graph\nexhibits strong generalizability across different GNN architectures.\n","authors":["Yezi Liu","Hanning Chen","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2409.08658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08655v1","updated":"2024-09-13T09:14:06Z","published":"2024-09-13T09:14:06Z","title":"LMAC-TD: Producing Time Domain Explanations for Audio Classifiers","summary":" Neural networks are typically black-boxes that remain opaque with regards to\ntheir decision mechanisms. Several works in the literature have proposed\npost-hoc explanation methods to alleviate this issue. This paper proposes\nLMAC-TD, a post-hoc explanation method that trains a decoder to produce\nexplanations directly in the time domain. This methodology builds upon the\nfoundation of L-MAC, Listenable Maps for Audio Classifiers, a method that\nproduces faithful and listenable explanations. We incorporate SepFormer, a\npopular transformer-based time-domain source separation architecture. We show\nthrough a user study that LMAC-TD significantly improves the audio quality of\nthe produced explanations while not sacrificing from faithfulness.\n","authors":["Eleonora Mancini","Francesco Paissan","Mirco Ravanelli","Cem Subakan"],"pdf_url":"https://arxiv.org/pdf/2409.08655v1.pdf","comment":"The first two authors contributed equally to this research. Author\n order is alphabetical"},{"id":"http://arxiv.org/abs/2409.08647v1","updated":"2024-09-13T09:09:24Z","published":"2024-09-13T09:09:24Z","title":"Training Gradient Boosted Decision Trees on Tabular Data Containing\n Label Noise for Classification Tasks","summary":" Label noise refers to the phenomenon where instances in a data set are\nassigned to the wrong label. Label noise is harmful to classifier performance,\nincreases model complexity and impairs feature selection. Addressing label\nnoise is crucial, yet current research primarily focuses on image and text data\nusing deep neural networks. This leaves a gap in the study of tabular data and\ngradient-boosted decision trees (GBDTs), the leading algorithm for tabular\ndata. Different methods have already been developed which either try to filter\nlabel noise, model label noise while simultaneously training a classifier or\nuse learning algorithms which remain effective even if label noise is present.\nThis study aims to further investigate the effects of label noise on\ngradient-boosted decision trees and methods to mitigate those effects. Through\ncomprehensive experiments and analysis, the implemented methods demonstrate\nstate-of-the-art noise detection performance on the Adult dataset and achieve\nthe highest classification precision and recall on the Adult and Breast Cancer\ndatasets, respectively. In summary, this paper enhances the understanding of\nthe impact of label noise on GBDTs and lays the groundwork for future research\nin noise detection and correction methods.\n","authors":["Anita Eisenbürger","Daniel Otten","Anselm Hudde","Frank Hopfgartner"],"pdf_url":"https://arxiv.org/pdf/2409.08647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14286v3","updated":"2024-09-13T08:59:47Z","published":"2023-05-23T17:30:10Z","title":"Equivariant Neural Simulators for Stochastic Spatiotemporal Dynamics","summary":" Neural networks are emerging as a tool for scalable data-driven simulation of\nhigh-dimensional dynamical systems, especially in settings where numerical\nmethods are infeasible or computationally expensive. 
Notably, it has been shown\nthat incorporating domain symmetries in deterministic neural simulators can\nsubstantially improve their accuracy, sample efficiency, and parameter\nefficiency. However, to incorporate symmetries in probabilistic neural\nsimulators that can simulate stochastic phenomena, we need a model that\nproduces equivariant distributions over trajectories, rather than equivariant\nfunction approximations. In this paper, we propose Equivariant Probabilistic\nNeural Simulation (EPNS), a framework for autoregressive probabilistic modeling\nof equivariant distributions over system evolutions. We use EPNS to design\nmodels for a stochastic n-body system and stochastic cellular dynamics. Our\nresults show that EPNS considerably outperforms existing neural network-based\nmethods for probabilistic simulation. More specifically, we demonstrate that\nincorporating equivariance in EPNS improves simulation quality, data\nefficiency, rollout stability, and uncertainty quantification. We conclude that\nEPNS is a promising method for efficient and effective data-driven\nprobabilistic simulation in a diverse range of domains.\n","authors":["Koen Minartz","Yoeri Poels","Simon Koop","Vlado Menkovski"],"pdf_url":"https://arxiv.org/pdf/2305.14286v3.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2409.08642v1","updated":"2024-09-13T08:59:31Z","published":"2024-09-13T08:59:31Z","title":"CPL: Critical Planning Step Learning Boosts LLM Generalization in\n Reasoning Tasks","summary":" Post-training large language models (LLMs) to develop reasoning capabilities\nhas proven effective across diverse domains, such as mathematical reasoning and\ncode generation. However, existing methods primarily focus on improving\ntask-specific reasoning but have not adequately addressed the model's\ngeneralization capabilities across a broader range of reasoning tasks. To\ntackle this challenge, we introduce Critical Planning Step Learning (CPL),\nwhich leverages Monte Carlo Tree Search (MCTS) to explore diverse planning\nsteps in multi-step reasoning tasks. Based on long-term outcomes, CPL learns\nstep-level planning preferences to improve the model's planning capabilities\nand, consequently, its general reasoning capabilities. Furthermore, while\neffective in many scenarios for aligning LLMs, existing preference learning\napproaches like Direct Preference Optimization (DPO) struggle with complex\nmulti-step reasoning tasks due to their inability to capture fine-grained\nsupervision at each step. We propose Step-level Advantage Preference\nOptimization (Step-APO), which integrates an advantage estimate for step-level\npreference pairs obtained via MCTS into the DPO. This enables the model to more\neffectively learn critical intermediate planning steps, thereby further\nimproving its generalization in reasoning tasks. 
Experimental results\ndemonstrate that our method, trained exclusively on GSM8K and MATH, not only\nsignificantly improves performance on GSM8K (+10.5%) and MATH (+6.5%), but also\nenhances out-of-domain reasoning benchmarks, such as ARC-C (+4.0%), BBH\n(+1.8%), MMLU-STEM (+2.2%), and MMLU (+0.9%).\n","authors":["Tianlong Wang","Xueting Han","Jing Bai"],"pdf_url":"https://arxiv.org/pdf/2409.08642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03953v3","updated":"2024-09-13T08:55:10Z","published":"2024-07-04T14:14:09Z","title":"Generalizing Graph Transformers Across Diverse Graphs and Tasks via\n Pre-Training on Industrial-Scale Data","summary":" Graph pre-training has been concentrated on graph-level on small graphs\n(e.g., molecular graphs) or learning node representations on a fixed graph.\nExtending graph pre-trained models to web-scale graphs with billions of nodes\nin industrial scenarios, while avoiding negative transfer across graphs or\ntasks, remains a challenge. We aim to develop a general graph pre-trained model\nwith inductive ability that can make predictions for unseen new nodes and even\nnew graphs. In this work, we introduce a scalable transformer-based graph\npre-training framework called PGT (Pre-trained Graph Transformer).\nSpecifically, we design a flexible and scalable graph transformer as the\nbackbone network. Meanwhile, based on the masked autoencoder architecture, we\ndesign two pre-training tasks: one for reconstructing node features and the\nother one for reconstructing local structures. Unlike the original autoencoder\narchitecture where the pre-trained decoder is discarded, we propose a novel\nstrategy that utilizes the decoder for feature augmentation. We have deployed\nour framework on Tencent's online game data. Extensive experiments have\ndemonstrated that our framework can perform pre-training on real-world\nweb-scale graphs with over 540 million nodes and 12 billion edges and\ngeneralizes effectively to unseen new graphs with different downstream tasks.\nWe further conduct experiments on the publicly available ogbn-papers100M\ndataset, which consists of 111 million nodes and 1.6 billion edges. Our\nframework achieves state-of-the-art performance on both industrial datasets and\npublic datasets, while also enjoying scalability and efficiency.\n","authors":["Yufei He","Zhenyu Hou","Yukuo Cen","Feng He","Xu Cheng","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2407.03953v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.08640v1","updated":"2024-09-13T08:53:10Z","published":"2024-09-13T08:53:10Z","title":"Byzantine-Robust and Communication-Efficient Distributed Learning via\n Compressed Momentum Filtering","summary":" Distributed learning has become the standard approach for training\nlarge-scale machine learning models across private data silos. While\ndistributed learning enhances privacy preservation and training efficiency, it\nfaces critical challenges related to Byzantine robustness and communication\nreduction. Existing Byzantine-robust and communication-efficient methods rely\non full gradient information either at every iteration or at certain iterations\nwith a probability, and they only converge to an unnecessarily large\nneighborhood around the solution. 
Motivated by these issues, we propose a novel\nByzantine-robust and communication-efficient stochastic distributed learning\nmethod that imposes no requirements on batch size and converges to a smaller\nneighborhood around the optimal solution than all existing methods, aligning\nwith the theoretical lower bound. Our key innovation is leveraging Polyak\nMomentum to mitigate the noise caused by both biased compressors and stochastic\ngradients, thus defending against Byzantine workers under information\ncompression. We provide proof of tight complexity bounds for our algorithm in\nthe context of non-convex smooth loss functions, demonstrating that these\nbounds match the lower bounds in Byzantine-free scenarios. Finally, we validate\nthe practical significance of our algorithm through an extensive series of\nexperiments, benchmarking its performance on both binary classification and\nimage classification tasks.\n","authors":["Changxin Liu","Yanghao Li","Yuhao Yi","Karl H. Johansson"],"pdf_url":"https://arxiv.org/pdf/2409.08640v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.08636v1","updated":"2024-09-13T08:43:42Z","published":"2024-09-13T08:43:42Z","title":"Utilizing Data Fingerprints for Privacy-Preserving Algorithm Selection\n in Time Series Classification: Performance and Uncertainty Estimation on\n Unseen Datasets","summary":" The selection of algorithms is a crucial step in designing AI services for\nreal-world time series classification use cases. Traditional methods such as\nneural architecture search, automated machine learning, combined algorithm\nselection, and hyperparameter optimizations are effective but require\nconsiderable computational resources and necessitate access to all data points\nto run their optimizations. In this work, we introduce a novel data fingerprint\nthat describes any time series classification dataset in a privacy-preserving\nmanner and provides insight into the algorithm selection problem without\nrequiring training on the (unseen) dataset. By decomposing the multi-target\nregression problem, only our data fingerprints are used to estimate algorithm\nperformance and uncertainty in a scalable and adaptable manner. Our approach is\nevaluated on the 112 University of California riverside benchmark datasets,\ndemonstrating its effectiveness in predicting the performance of 35\nstate-of-the-art algorithms and providing valuable insights for effective\nalgorithm selection in time series classification service systems, improving a\nnaive baseline by 7.32% on average in estimating the mean performance and\n15.81% in estimating the uncertainty.\n","authors":["Lars Böcking","Leopold Müller","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2409.08636v1.pdf","comment":"Hawaii International Conference on System Sciences (HICSS-58) 2025"},{"id":"http://arxiv.org/abs/2409.08633v1","updated":"2024-09-13T08:37:23Z","published":"2024-09-13T08:37:23Z","title":"Improving Analog Neural Network Robustness: A Noise-Agnostic Approach\n with Explainable Regularizations","summary":" This work tackles the critical challenge of mitigating \"hardware noise\" in\ndeep analog neural networks, a major obstacle in advancing analog signal\nprocessing devices. We propose a comprehensive, hardware-agnostic solution to\naddress both correlated and uncorrelated noise affecting the activation layers\nof deep neural models. 
The novelty of our approach lies in its ability to\ndemystify the \"black box\" nature of noise-resilient networks by revealing the\nunderlying mechanisms that reduce sensitivity to noise. In doing so, we\nintroduce a new explainable regularization framework that harnesses these\nmechanisms to significantly enhance noise robustness in deep neural\narchitectures.\n","authors":["Alice Duque","Pedro Freire","Egor Manuylovich","Dmitrii Stoliarov","Jaroslaw Prilepsky","Sergei Turitsyn"],"pdf_url":"https://arxiv.org/pdf/2409.08633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08621v1","updated":"2024-09-13T08:18:01Z","published":"2024-09-13T08:18:01Z","title":"Co-Optimization of Robot Design and Control: Enhancing Performance and\n Understanding Design Complexity","summary":" The design (shape) of a robot is usually decided before the control is\nimplemented. This might limit how well the design is adapted to a task, as the\nsuitability of the design is given by how well the robot performs in the task,\nwhich requires both a design and a controller. The co-optimization or\nsimultaneous optimization of the design and control of robots addresses this\nlimitation by producing a design and control that are both adapted to the task.\nIn this paper, we investigate some of the challenges inherent in the\nco-optimization of design and control. We show that retraining the controller\nof a robot with additional resources after the co-optimization process\nterminates significantly improves the robot's performance. In addition, we\ndemonstrate that the resources allocated to training the controller for each\ndesign influence the design complexity, where simpler designs are associated\nwith lower training budgets. The experimentation is conducted in four publicly\navailable simulation environments for co-optimization of design and control,\nmaking the findings more applicable to the general case. The results presented\nin this paper hope to guide other practitioners in the co-optimization of\ndesign and control of robots.\n","authors":["Etor Arza","Frank Veenstra","Tønnes F. Nygaard","Kyrre Glette"],"pdf_url":"https://arxiv.org/pdf/2409.08621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07884v2","updated":"2024-09-13T08:00:12Z","published":"2024-09-12T09:44:13Z","title":"Graph Neural Networks for Parkinsons Disease Detection","summary":" Despite the promising performance of state of the art approaches for\nParkinsons Disease (PD) detection, these approaches often analyze individual\nspeech segments in isolation, which can lead to suboptimal results. Dysarthric\ncues that characterize speech impairments from PD patients are expected to be\nrelated across segments from different speakers. Isolated segment analysis\nfails to exploit these inter segment relationships. Additionally, not all\nspeech segments from PD patients exhibit clear dysarthric symptoms, introducing\nlabel noise that can negatively affect the performance and generalizability of\ncurrent approaches. To address these challenges, we propose a novel PD\ndetection framework utilizing Graph Convolutional Networks (GCNs). By\nrepresenting speech segments as nodes and capturing the similarity between\nsegments through edges, our GCN model facilitates the aggregation of dysarthric\ncues across the graph, effectively exploiting segment relationships and\nmitigating the impact of label noise. 
Experimental results demonstrate\nthe advantages of the proposed GCN model for PD detection and provide insights\ninto its underlying mechanisms.\n","authors":["Shakeel A. Sheikh","Yacouba Kaloga","Ina Kodrasi"],"pdf_url":"https://arxiv.org/pdf/2409.07884v2.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.08609v1","updated":"2024-09-13T07:52:45Z","published":"2024-09-13T07:52:45Z","title":"Optimizing Item-based Marketing Promotion Efficiency in C2C Marketplace\n with Dynamic Sequential Coupon Allocation Framework","summary":" In e-commerce platforms, coupons play a crucial role in boosting\ntransactions. In the customer-to-customer (C2C) marketplace, ensuring the\nsatisfaction of both buyers and sellers is essential. While buyer-focused\nmarketing strategies often receive more attention, addressing the needs of\nsellers is equally important. Additionally, the existing strategies tend to\noptimize each promotion independently, resulting in a lack of continuity\nbetween promotions and unnecessary costs in the pursuit of short-term impact\nwithin each promotion period.\n We introduce a Dynamic Sequential Coupon Allocation Framework (DSCAF) to\noptimize item coupon allocation strategies across a series of promotions. DSCAF\nprovides sequential recommendations for coupon configurations and timing to\ntarget items. In cases where initial suggestions do not lead to sales, it\ndynamically adjusts the strategy and offers subsequent solutions. It integrates\ntwo predictors for estimating the sale propensity in the current and subsequent\nrounds of coupon allocation, and a decision-making process to determine the\ncoupon allocation solution. It runs iteratively until the item is sold. The\ngoal of the framework is to maximize Return on Investment (ROI) while ensuring\nlift Sell-through Rate (STR) remains above a specified threshold. DSCAF aims to\noptimize sequential coupon efficiency with a long-term perspective rather than\nsolely focusing on the lift achieved in each individual promotion. It has been\napplied for item coupon allocation in Mercari.\n","authors":["Jie Yang","Padunna Valappil Krishnaraj Sekhar","Sho Sekine","Yilin Li"],"pdf_url":"https://arxiv.org/pdf/2409.08609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08595v1","updated":"2024-09-13T07:27:55Z","published":"2024-09-13T07:27:55Z","title":"Automatic Generation of Fast and Accurate Performance Models for Deep\n Neural Network Accelerators","summary":" Implementing Deep Neural Networks (DNNs) on resource-constrained edge devices\nis a challenging task that requires tailored hardware accelerator architectures\nand a clear understanding of their performance characteristics when executing\nthe intended AI workload. To facilitate this, we present an automated\ngeneration approach for fast performance models to accurately estimate the\nlatency of a DNN mapped onto systematically modeled and concisely described\naccelerator architectures. Using our accelerator architecture description\nmethod, we modeled representative DNN accelerators such as Gemmini, UltraTrail,\nPlasticine-derived, and a parameterizable systolic array. Together with DNN\nmappings for those modeled architectures, we perform a combined DNN/hardware\ndependency graph analysis, which enables us, in the best case, to evaluate only\n154 loop kernel iterations to estimate the performance for 4.19 billion\ninstructions achieving a significant speedup. 
We outperform regression and\nanalytical models in terms of mean absolute percentage error (MAPE) compared to\nsimulation results, while being several magnitudes faster than an RTL\nsimulation.\n","authors":["Konstantin Lübeck","Alexander Louis-Ferdinand Jung","Felix Wedlich","Mika Markus Müller","Federico Nicolás Peccia","Felix Thömmes","Jannik Steinmetz","Valentin Biermaier","Adrian Frischknecht","Paul Palomero Bernardo","Oliver Bringmann"],"pdf_url":"https://arxiv.org/pdf/2409.08595v1.pdf","comment":"Accepted version for: ACM Transactions on Embedded Computing Systems"},{"id":"http://arxiv.org/abs/2409.08584v1","updated":"2024-09-13T07:03:01Z","published":"2024-09-13T07:03:01Z","title":"CompressedMediQ: Hybrid Quantum Machine Learning Pipeline for\n High-Dimentional Neuroimaging Data","summary":" This paper introduces CompressedMediQ, a novel hybrid quantum-classical\nmachine learning pipeline specifically developed to address the computational\nchallenges associated with high-dimensional multi-class neuroimaging data\nanalysis. Standard neuroimaging datasets, such as 4D MRI data from the\nAlzheimer's Disease Neuroimaging Initiative (ADNI) and Neuroimaging in\nFrontotemporal Dementia (NIFD), present significant hurdles due to their vast\nsize and complexity. CompressedMediQ integrates classical high-performance\ncomputing (HPC) nodes for advanced MRI pre-processing and Convolutional Neural\nNetwork (CNN)-PCA-based feature extraction and reduction, addressing the\nlimited-qubit availability for quantum data encoding in the NISQ (Noisy\nIntermediate-Scale Quantum) era. This is followed by Quantum Support Vector\nMachine (QSVM) classification. By utilizing quantum kernel methods, the\npipeline optimizes feature mapping and classification, enhancing data\nseparability and outperforming traditional neuroimaging analysis techniques.\nExperimental results highlight the pipeline's superior accuracy in dementia\nstaging, validating the practical use of quantum machine learning in clinical\ndiagnostics. Despite the limitations of NISQ devices, this proof-of-concept\ndemonstrates the transformative potential of quantum-enhanced learning, paving\nthe way for scalable and precise diagnostic tools in healthcare and signal\nprocessing.\n","authors":["Kuan-Cheng Chen","Yi-Tien Li","Tai-Yu Li","Chen-Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2409.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08581v1","updated":"2024-09-13T07:00:18Z","published":"2024-09-13T07:00:18Z","title":"Learning Short Codes for Fading Channels with No or Receiver-Only\n Channel State Information","summary":" In next-generation wireless networks, low latency often necessitates\nshort-length codewords that either do not use channel state information (CSI)\nor rely solely on CSI at the receiver (CSIR). Gaussian codes that achieve\ncapacity for AWGN channels may be unsuitable for these no-CSI and CSIR-only\ncases. In this work, we design short-length codewords for these cases using an\nautoencoder architecture. From the designed codes, we observe the following: In\nthe no-CSI case, the learned codes are mutually orthogonal when the\ndistribution of the real and imaginary parts of the fading random variable has\nsupport over the entire real line. However, when the support is limited to the\nnon-negative real line, the codes are not mutually orthogonal. 
For the\nCSIR-only case, deep learning-based codes designed for AWGN channels perform\nworse in fading channels with optimal coherent detection compared to codes\nspecifically designed for fading channels with CSIR, where the autoencoder\njointly learns encoding, coherent combining, and decoding. In both no-CSI and\nCSIR-only cases, the codes perform at least as well as or better than classical\ncodes of the same block length.\n","authors":["Rishabh Sharad Pomaje","Rajshekhar V Bhat"],"pdf_url":"https://arxiv.org/pdf/2409.08581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15469v2","updated":"2024-09-13T06:59:18Z","published":"2023-12-24T12:28:07Z","title":"Efficient Estimation of the Central Mean Subspace via Smoothed Gradient\n Outer Products","summary":" We consider the problem of sufficient dimension reduction (SDR) for\nmulti-index models. The estimators of the central mean subspace in prior works\neither have slow (non-parametric) convergence rates, or rely on stringent\ndistributional conditions (e.g., the covariate distribution $P_{\\mathbf{X}}$\nbeing elliptical symmetric). In this paper, we show that a fast parametric\nconvergence rate of form $C_d \\cdot n^{-1/2}$ is achievable via estimating the\n\\emph{expected smoothed gradient outer product}, for a general class of\ndistribution $P_{\\mathbf{X}}$ admitting Gaussian or heavier distributions. When\nthe link function is a polynomial with a degree of at most $r$ and\n$P_{\\mathbf{X}}$ is the standard Gaussian, we show that the prefactor depends\non the ambient dimension $d$ as $C_d \\propto d^r$.\n","authors":["Gan Yuan","Mingyue Xu","Samory Kpotufe","Daniel Hsu"],"pdf_url":"https://arxiv.org/pdf/2312.15469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08580v1","updated":"2024-09-13T06:59:10Z","published":"2024-09-13T06:59:10Z","title":"Molecular Graph Representation Learning via Structural Similarity\n Information","summary":" Graph Neural Networks (GNNs) have been widely employed for feature\nrepresentation learning in molecular graphs. Therefore, it is crucial to\nenhance the expressiveness of feature representation to ensure the\neffectiveness of GNNs. However, a significant portion of current research\nprimarily focuses on the structural features within individual molecules, often\noverlooking the structural similarity between molecules, which is a crucial\naspect encapsulating rich information on the relationship between molecular\nproperties and structural characteristics. Thus, these approaches fail to\ncapture the rich semantic information at the molecular structure level. To\nbridge this gap, we introduce the \\textbf{Molecular Structural Similarity Motif\nGNN (MSSM-GNN)}, a novel molecular graph representation learning method that\ncan capture structural similarity information among molecules from a global\nperspective. In particular, we propose a specially designed graph that\nleverages graph kernel algorithms to represent the similarity between molecules\nquantitatively. Subsequently, we employ GNNs to learn feature representations\nfrom molecular graphs, aiming to enhance the accuracy of property prediction by\nincorporating additional molecular representation information. Finally, through\na series of experiments conducted on both small-scale and large-scale molecular\ndatasets, we demonstrate that our model consistently outperforms eleven\nstate-of-the-art baselines. 
The codes are available at\nhttps://github.com/yaoyao-yaoyao-cell/MSSM-GNN.\n","authors":["Chengyu Yao","Hong Huang","Hang Gao","Fengge Wu","Haiming Chen","Junsuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.08580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05693v2","updated":"2024-09-13T06:57:01Z","published":"2024-07-08T07:47:30Z","title":"Sub-SA: Strengthen In-context Learning via Submodular Selective\n Annotation","summary":" In-context learning (ICL) leverages in-context examples as prompts for the\npredictions of Large Language Models (LLMs). These prompts play a crucial role\nin achieving strong performance. However, the selection of suitable prompts\nfrom a large pool of labeled examples often entails significant annotation\ncosts. To address this challenge, we propose Sub-SA (Submodular Selective\nAnnotation), a submodule-based selective annotation method. The aim of Sub-SA\nis to reduce annotation costs while improving the quality of in-context\nexamples and minimizing the time consumption of the selection process. In\nSub-SA, we design a submodular function that facilitates effective subset\nselection for annotation and demonstrates the characteristics of monotonically\nand submodularity from the theoretical perspective. Specifically, we propose\nRPR (Reward and Penalty Regularization) to better balance the diversity and\nrepresentativeness of the unlabeled dataset attributed to a reward term and a\npenalty term, respectively. Consequently, the selection for annotations can be\neffectively addressed with a simple yet effective greedy search algorithm based\non the submodular function. Finally, we apply the similarity prompt retrieval\nto get the examples for ICL.\n","authors":["Jian Qian","Miao Sun","Sifan Zhou","Ziyu Zhao","Ruizhi Hun","Patrick Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.05693v2.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2311.15327v5","updated":"2024-09-13T06:53:57Z","published":"2023-11-26T15:11:17Z","title":"FRAC-Q-Learning: A Reinforcement Learning with Boredom Avoidance\n Processes for Social Robots","summary":" The reinforcement learning algorithms have often been applied to social\nrobots. However, most reinforcement learning algorithms were not optimized for\nthe use of social robots, and consequently they may bore users. We proposed a\nnew reinforcement learning method specialized for the social robot, the\nFRAC-Q-learning, that can avoid user boredom. The proposed algorithm consists\nof a forgetting process in addition to randomizing and categorizing processes.\nThis study evaluated interest and boredom hardness scores of the\nFRAC-Q-learning by a comparison with the traditional Q-learning. The\nFRAC-Q-learning showed significantly higher trend of interest score, and\nindicated significantly harder to bore users compared to the traditional\nQ-learning. Therefore, the FRAC-Q-learning can contribute to develop a social\nrobot that will not bore users. The proposed algorithm has a potential to apply\nfor Web-based communication and educational systems. 
This paper presents the\nentire process, detailed implementation and a detailed evaluation method of the\nFRAC-Q-learning for the first time.\n","authors":["Akinari Onishi"],"pdf_url":"https://arxiv.org/pdf/2311.15327v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08570v1","updated":"2024-09-13T06:40:56Z","published":"2024-09-13T06:40:56Z","title":"Batch Ensemble for Variance Dependent Regret in Stochastic Bandits","summary":" Efficiently trading off exploration and exploitation is one of the key\nchallenges in online Reinforcement Learning (RL). Most works achieve this by\ncarefully estimating the model uncertainty and following the so-called\noptimistic model. Inspired by practical ensemble methods, in this work we\npropose a simple and novel batch ensemble scheme that provably achieves\nnear-optimal regret for stochastic Multi-Armed Bandits (MAB). Crucially, our\nalgorithm has just a single parameter, namely the number of batches, and its\nvalue does not depend on distributional properties such as the scale and\nvariance of the losses. We complement our theoretical results by demonstrating\nthe effectiveness of our algorithm on synthetic benchmarks.\n","authors":["Asaf Cassel","Orin Levy","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2409.08570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13781v4","updated":"2024-09-13T06:33:53Z","published":"2023-09-25T00:16:43Z","title":"Explainable Machine Learning for ICU Readmission Prediction","summary":" The intensive care unit (ICU) comprises a complex hospital environment, where\ndecisions made by clinicians have a high level of risk for the patients' lives.\nA comprehensive care pathway must then be followed to reduce complications.\nUncertain, competing and unplanned aspects within this environment increase the\ndifficulty in uniformly implementing the care pathway. Readmission contributes\nto this pathway's difficulty, occurring when patients are admitted again to the\nICU in a short timeframe, resulting in high mortality rates and high resource\nutilisation. Several works have tried to predict readmission through patients'\nmedical information. Although they have some level of success while predicting\nreadmission, those works do not properly assess, characterise and understand\nreadmission prediction. This work proposes a standardised and explainable\nmachine learning pipeline to model patient readmission on a multicentric\ndatabase (i.e., the eICU cohort with 166,355 patients, 200,859 admissions and\n6,021 readmissions) while validating it on monocentric (i.e., the MIMIC IV\ncohort with 382,278 patients, 523,740 admissions and 5,984 readmissions) and\nmulticentric settings. Our machine learning pipeline achieved predictive\nperformance in terms of the area of the receiver operating characteristic curve\n(AUC) up to 0.7 with a Random Forest classification model, yielding an overall\ngood calibration and consistency on validation sets. From explanations provided\nby the constructed models, we could also derive a set of insightful\nconclusions, primarily on variables related to vital signs and blood tests\n(e.g., albumin, blood urea nitrogen and hemoglobin levels), demographics (e.g.,\nage, and admission height and weight), and ICU-associated variables (e.g., unit\ntype). These insights provide an invaluable source of information during\nclinicians' decision-making while discharging ICU patients.\n","authors":["Alex G. C. 
de Sá","Daniel Gould","Anna Fedyukova","Mitchell Nicholas","Lucy Dockrell","Calvin Fletcher","David Pilcher","Daniel Capurro","David B. Ascher","Khaled El-Khawas","Douglas E. V. Pires"],"pdf_url":"https://arxiv.org/pdf/2309.13781v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08563v1","updated":"2024-09-13T06:33:41Z","published":"2024-09-13T06:33:41Z","title":"Second-order difference subspace","summary":" Subspace representation is a fundamental technique in various fields of\nmachine learning. Analyzing a geometrical relationship among multiple subspaces\nis essential for understanding subspace series' temporal and/or spatial\ndynamics. This paper proposes the second-order difference subspace, a\nhigher-order extension of the first-order difference subspace between two\nsubspaces that can analyze the geometrical difference between them. As a\npreliminary for that, we extend the definition of the first-order difference\nsubspace to the more general setting that two subspaces with different\ndimensions have an intersection. We then define the second-order difference\nsubspace by combining the concept of first-order difference subspace and\nprincipal component subspace (Karcher mean) between two subspaces, motivated by\nthe second-order central difference method. We can understand that the\nfirst/second-order difference subspaces correspond to the velocity and\nacceleration of subspace dynamics from the viewpoint of a geodesic on a\nGrassmann manifold. We demonstrate the validity and naturalness of our\nsecond-order difference subspace by showing numerical results on two\napplications: temporal shape analysis of a 3D object and time series analysis\nof a biometric signal.\n","authors":["Kazuhiro Fukui","Pedro H. V. Valois","Lincon Souza","Takumi Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2409.08563v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2409.08558v1","updated":"2024-09-13T06:24:18Z","published":"2024-09-13T06:24:18Z","title":"Fair CoVariance Neural Networks","summary":" Covariance-based data processing is widespread across signal processing and\nmachine learning applications due to its ability to model data\ninterconnectivities and dependencies. However, harmful biases in the data may\nbecome encoded in the sample covariance matrix and cause data-driven methods to\ntreat different subpopulations unfairly. Existing works such as fair principal\ncomponent analysis (PCA) mitigate these effects, but remain unstable in low\nsample regimes, which in turn may jeopardize the fairness goal. To address both\nbiases and instability, we propose Fair coVariance Neural Networks (FVNNs),\nwhich perform graph convolutions on the covariance matrix for both fair and\naccurate predictions. Our FVNNs provide a flexible model compatible with\nseveral existing bias mitigation techniques. In particular, FVNNs allow for\nmitigating the bias in two ways: first, they operate on fair covariance\nestimates that remove biases from their principal components; second, they are\ntrained in an end-to-end fashion via a fairness regularizer in the loss\nfunction so that the model parameters are tailored to solve the task directly\nin a fair manner. We prove that FVNNs are intrinsically fairer than analogous\nPCA approaches thanks to their stability in low sample regimes. 
We validate the\nrobustness and fairness of our model on synthetic and real-world data,\nshowcasing the flexibility of FVNNs along with the tradeoff between fair and\naccurate performance.\n","authors":["Andrea Cavallo","Madeline Navarro","Santiago Segarra","Elvin Isufi"],"pdf_url":"https://arxiv.org/pdf/2409.08558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08551v1","updated":"2024-09-13T06:10:54Z","published":"2024-09-13T06:10:54Z","title":"Think Twice Before You Act: Improving Inverse Problem Solving With MCMC","summary":" Recent studies demonstrate that diffusion models can serve as a strong prior\nfor solving inverse problems. A prominent example is Diffusion Posterior\nSampling (DPS), which approximates the posterior distribution of data given the\nmeasure using Tweedie's formula. Despite the merits of being versatile in\nsolving various inverse problems without re-training, the performance of DPS is\nhindered by the fact that this posterior approximation can be inaccurate\nespecially for high noise levels. Therefore, we propose \\textbf{D}iffusion\n\\textbf{P}osterior \\textbf{MC}MC (\\textbf{DPMC}), a novel inference algorithm\nbased on Annealed MCMC to solve inverse problems with pretrained diffusion\nmodels. We define a series of intermediate distributions inspired by the\napproximated conditional distributions used by DPS. Through annealed MCMC\nsampling, we encourage the samples to follow each intermediate distribution\nmore closely before moving to the next distribution at a lower noise level, and\ntherefore reduce the accumulated error along the path. We test our algorithm in\nvarious inverse problems, including super resolution, Gaussian deblurring,\nmotion deblurring, inpainting, and phase retrieval. Our algorithm outperforms\nDPS with less number of evaluations across nearly all tasks, and is competitive\namong existing approaches.\n","authors":["Yaxuan Zhu","Zehao Dou","Haoxin Zheng","Yasi Zhang","Ying Nian Wu","Ruiqi Gao"],"pdf_url":"https://arxiv.org/pdf/2409.08551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08544v1","updated":"2024-09-13T05:39:00Z","published":"2024-09-13T05:39:00Z","title":"Causal GNNs: A GNN-Driven Instrumental Variable Approach for Causal\n Inference in Networks","summary":" As network data applications continue to expand, causal inference within\nnetworks has garnered increasing attention. However, hidden confounders\ncomplicate the estimation of causal effects. Most methods rely on the strong\nignorability assumption, which presumes the absence of hidden confounders-an\nassumption that is both difficult to validate and often unrealistic in\npractice. To address this issue, we propose CgNN, a novel approach that\nleverages network structure as instrumental variables (IVs), combined with\ngraph neural networks (GNNs) and attention mechanisms, to mitigate hidden\nconfounder bias and improve causal effect estimation. By utilizing network\nstructure as IVs, we reduce confounder bias while preserving the correlation\nwith treatment. Our integration of attention mechanisms enhances robustness and\nimproves the identification of important nodes. 
Validated on two real-world\ndatasets, our results demonstrate that CgNN effectively mitigates hidden\nconfounder bias and offers a robust GNN-driven IV framework for causal\ninference in complex network data.\n","authors":["Xiaojing Du","Feiyu Yang","Wentao Gao","Xiongren Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09632v3","updated":"2024-09-13T05:34:14Z","published":"2024-08-19T01:30:14Z","title":"MoDeGPT: Modular Decomposition for Large Language Model Compression","summary":" Large Language Models (LLMs) have reshaped the landscape of artificial\nintelligence by demonstrating exceptional performance across various tasks.\nHowever, substantial computational requirements make their deployment\nchallenging on devices with limited resources. Recently, compression methods\nusing low-rank matrix techniques have shown promise, yet these often lead to\ndegraded accuracy or introduce significant overhead in parameters and inference\nlatency. This paper introduces \\textbf{Mo}dular \\textbf{De}composition\n(MoDeGPT), a novel structured compression framework that does not need recovery\nfine-tuning while resolving the above drawbacks. MoDeGPT partitions the\nTransformer block into modules comprised of matrix pairs and reduces the hidden\ndimensions via reconstructing the module-level outputs. MoDeGPT is developed\nbased on a theoretical framework that utilizes three well-established matrix\ndecomposition algorithms -- Nystr\\\"om approximation, CR decomposition, and SVD\n-- and applies them to our redefined transformer modules. Our comprehensive\nexperiments show MoDeGPT, without backward propagation, matches or surpasses\nprevious structured compression methods that rely on gradient information, and\nsaves 98% of compute costs on compressing a 13B model. On \\textsc{Llama}-2/3\nand OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30%\ncompression rates. Moreover, the compression can be done on a single GPU within\na few hours and increases the inference throughput by up to 46%.\n","authors":["Chi-Heng Lin","Shangqian Gao","James Seale Smith","Abhishek Patel","Shikhar Tuli","Yilin Shen","Hongxia Jin","Yen-Chang Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09632v3.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.07825v2","updated":"2024-09-13T05:33:47Z","published":"2024-09-12T08:15:39Z","title":"A Comprehensive Survey on Deep Multimodal Learning with Missing Modality","summary":" During multimodal model training and reasoning, data samples may miss certain\nmodalities and lead to compromised model performance due to sensor limitations,\ncost constraints, privacy concerns, data loss, and temporal and spatial\nfactors. This survey provides an overview of recent progress in Multimodal\nLearning with Missing Modality (MLMM), focusing on deep learning techniques. 
It\nis the first comprehensive survey that covers the historical background and the\ndistinction between MLMM and standard multimodal learning setups, followed by a\ndetailed analysis of current MLMM methods, applications, and datasets,\nconcluding with a discussion about challenges and potential future directions\nin the field.\n","authors":["Renjie Wu","Hu Wang","Hsiang-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2409.07825v2.pdf","comment":"Work in progress; open to discussion; planning to submit to ACM CSUR\n in September"},{"id":"http://arxiv.org/abs/2402.16158v2","updated":"2024-09-13T05:18:58Z","published":"2024-02-25T17:37:53Z","title":"Distribution-Free Fair Federated Learning with Small Samples","summary":" As federated learning gains increasing importance in real-world applications\ndue to its capacity for decentralized data training, addressing fairness\nconcerns across demographic groups becomes critically important. However, most\nexisting machine learning algorithms for ensuring fairness are designed for\ncentralized data environments and generally require large-sample and\ndistributional assumptions, underscoring the urgent need for fairness\ntechniques adapted for decentralized and heterogeneous systems with\nfinite-sample and distribution-free guarantees. To address this issue, this\npaper introduces FedFaiREE, a post-processing algorithm developed specifically\nfor distribution-free fair learning in decentralized settings with small\nsamples. Our approach accounts for unique challenges in decentralized\nenvironments, such as client heterogeneity, communication costs, and small\nsample sizes. We provide rigorous theoretical guarantees for both fairness and\naccuracy, and our experimental results further provide robust empirical\nvalidation for our proposed method.\n","authors":["Qichuan Yin","Zexian Wang","Junzhou Huang","Huaxiu Yao","Linjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.16158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06190v2","updated":"2024-09-13T05:01:02Z","published":"2024-09-10T03:41:10Z","title":"Multi-Source Music Generation with Latent Diffusion","summary":" Most music generation models directly generate a single music mixture. To\nallow for more flexible and controllable generation, the Multi-Source Diffusion\nModel (MSDM) has been proposed to model music as a mixture of multiple\ninstrumental sources (e.g. piano, drums, bass, and guitar). Its goal is to use\none single diffusion model to generate mutually-coherent music sources, that\nare then mixed to form the music. Despite its capabilities, MSDM is unable to\ngenerate music with rich melodies and often generates empty sounds. Its\nwaveform diffusion approach also introduces significant Gaussian noise\nartifacts that compromise audio quality. In response, we introduce a\nMulti-Source Latent Diffusion Model (MSLDM) that employs Variational\nAutoencoders (VAEs) to encode each instrumental source into a distinct latent\nrepresentation. By training a VAE on all music sources, we efficiently capture\neach source's unique characteristics in a \"source latent.\" The source latents\nare concatenated and our diffusion model learns this joint latent space. This\napproach significantly enhances the total and partial generation of music by\nleveraging the VAE's latent compression and noise-robustness. The compressed\nsource latent also facilitates more efficient generation. 
Subjective listening\ntests and Frechet Audio Distance (FAD) scores confirm that our model\noutperforms MSDM, showcasing its practical and enhanced applicability in music\ngeneration systems. We also emphasize that modeling sources is more effective\nthan direct music mixture modeling. Codes and models are available at\nhttps://github.com/XZWY/MSLDM. Demos are available at\nhttps://xzwy.github.io/MSLDMDemo/.\n","authors":["Zhongweiyang Xu","Debottam Dutta","Yu-Lin Wei","Romit Roy Choudhury"],"pdf_url":"https://arxiv.org/pdf/2409.06190v2.pdf","comment":"ICASSP 2025 in Submission"},{"id":"http://arxiv.org/abs/2409.08538v1","updated":"2024-09-13T04:59:35Z","published":"2024-09-13T04:59:35Z","title":"An Efficient Privacy-aware Split Learning Framework for Satellite\n Communications","summary":" In the rapidly evolving domain of satellite communications, integrating\nadvanced machine learning techniques, particularly split learning, is crucial\nfor enhancing data processing and model training efficiency across satellites,\nspace stations, and ground stations. Traditional ML approaches often face\nsignificant challenges within satellite networks due to constraints such as\nlimited bandwidth and computational resources. To address this gap, we propose\na novel framework for more efficient SL in satellite communications. Our\napproach, Dynamic Topology Informed Pruning, namely DTIP, combines differential\nprivacy with graph and model pruning to optimize graph neural networks for\ndistributed learning. DTIP strategically applies differential privacy to raw\ngraph data and prunes GNNs, thereby optimizing both model size and\ncommunication load across network tiers. Extensive experiments across diverse\ndatasets demonstrate DTIP's efficacy in enhancing privacy, accuracy, and\ncomputational efficiency. Specifically, on Amazon2M dataset, DTIP maintains an\naccuracy of 0.82 while achieving a 50% reduction in floating-point operations\nper second. Similarly, on ArXiv dataset, DTIP achieves an accuracy of 0.85\nunder comparable conditions. Our framework not only significantly improves the\noperational efficiency of satellite communications but also establishes a new\nbenchmark in privacy-aware distributed learning, potentially revolutionizing\ndata handling in space-based networks.\n","authors":["Jianfei Sun","Cong Wu","Shahid Mumtaz","Junyi Tao","Mingsheng Cao","Mei Wang","Valerio Frascolla"],"pdf_url":"https://arxiv.org/pdf/2409.08538v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.14515v2","updated":"2024-09-13T04:25:37Z","published":"2024-08-25T11:33:52Z","title":"A Joint Learning Model with Variational Interaction for Multilingual\n Program Translation","summary":" Programs implemented in various programming languages form the foundation of\nsoftware applications. To alleviate the burden of program migration and\nfacilitate the development of software systems, automated program translation\nacross languages has garnered significant attention. Previous approaches\nprimarily focus on pairwise translation paradigms, learning translation between\npairs of languages using bilingual parallel data. However, parallel data is\ndifficult to collect for some language pairs, and the distribution of program\nsemantics across languages can shift, posing challenges for pairwise program\ntranslation. In this paper, we argue that jointly learning a unified model to\ntranslate code across multiple programming languages is superior to separately\nlearning from bilingual parallel data. 
We propose Variational Interaction for\nMultilingual Program Translation~(VIM-PT), a disentanglement-based generative\napproach that jointly trains a unified model for multilingual program\ntranslation across multiple languages. VIM-PT disentangles code into\nlanguage-shared and language-specific features, using variational inference and\ninteraction information with a novel lower bound, then achieves program\ntranslation through conditional generation. VIM-PT demonstrates four\nadvantages: 1) captures language-shared information more accurately from\nvarious implementations and improves the quality of multilingual program\ntranslation, 2) mines and leverages the capability of non-parallel data, 3)\naddresses the distribution shift of program semantics across languages, 4) and\nserves as a unified model, reducing deployment complexity.\n","authors":["Yali Du","Hui Sun","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2408.14515v2.pdf","comment":"Accepted by the 39th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2024)"},{"id":"http://arxiv.org/abs/2409.08530v1","updated":"2024-09-13T04:23:54Z","published":"2024-09-13T04:23:54Z","title":"Integration of Mamba and Transformer -- MAT for Long-Short Range Time\n Series Forecasting with Application to Weather Dynamics","summary":" Long-short range time series forecasting is essential for predicting future\ntrends and patterns over extended periods. While deep learning models such as\nTransformers have made significant strides in advancing time series\nforecasting, they often encounter difficulties in capturing long-term\ndependencies and effectively managing sparse semantic features. The state-space\nmodel, Mamba, addresses these issues through its adept handling of selective\ninput and parallel computing, striking a balance between computational\nefficiency and prediction accuracy. This article examines the advantages and\ndisadvantages of both Mamba and Transformer models, and introduces a combined\napproach, MAT, which leverages the strengths of each model to capture unique\nlong-short range dependencies and inherent evolutionary patterns in\nmultivariate time series. Specifically, MAT harnesses the long-range dependency\ncapabilities of Mamba and the short-range characteristics of Transformers.\nExperimental results on benchmark weather datasets demonstrate that MAT\noutperforms existing comparable methods in terms of prediction accuracy,\nscalability, and memory efficiency.\n","authors":["Wenqing Zhang","Junming Huang","Ruotong Wang","Changsong Wei","Wenqian Huang","Yuxin Qiao"],"pdf_url":"https://arxiv.org/pdf/2409.08530v1.pdf","comment":"6 pages, 4 figures, to be presented at the 5th International\n Conference on Electrical, Communication and Computer Engineering (ICECCE)"},{"id":"http://arxiv.org/abs/2407.04211v2","updated":"2024-09-13T03:58:20Z","published":"2024-07-05T01:47:20Z","title":"TimeLDM: Latent Diffusion Model for Unconditional Time Series Generation","summary":" Time series generation is a crucial research topic in the area of\ndecision-making systems, which can be particularly important in domains like\nautonomous driving, healthcare, and, notably, robotics. Recent approaches focus\non learning in the data space to model time series information. However, the\ndata space often contains limited observations and noisy features. In this\npaper, we propose TimeLDM, a novel latent diffusion model for high-quality time\nseries generation. 
TimeLDM is composed of a variational autoencoder that\nencodes time series into an informative and smoothed latent content and a\nlatent diffusion model operating in the latent space to generate latent\ninformation. We evaluate the ability of our method to generate synthetic time\nseries with simulated and real-world datasets and benchmark the performance\nagainst existing state-of-the-art methods. Qualitatively and quantitatively, we\nfind that the proposed TimeLDM persistently delivers high-quality generated\ntime series. For example, TimeLDM achieves new state-of-the-art results on the\nsimulated benchmarks and an average improvement of 55% in Discriminative score\nwith all benchmarks. Further studies demonstrate that our method yields more\nrobust outcomes across various lengths of time series data generation.\nEspecially, for the Context-FID score and Discriminative score, TimeLDM\nrealizes significant improvements of 80% and 50%, respectively. The code will\nbe released after publication.\n","authors":["Jian Qian","Bingyu Xie","Biao Wan","Minhao Li","Miao Sun","Patrick Yin Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.04211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08522v1","updated":"2024-09-13T03:45:10Z","published":"2024-09-13T03:45:10Z","title":"MAPX: An explainable model-agnostic framework for the detection of false\n information on social media networks","summary":" The automated detection of false information has become a fundamental task in\ncombating the spread of \"fake news\" on online social media networks (OSMN) as\nit reduces the need for manual discernment by individuals. In the literature,\nleveraging various content or context features of OSMN documents have been\nfound useful. However, most of the existing detection models often utilise\nthese features in isolation without regard to the temporal and dynamic changes\noft-seen in reality, thus, limiting the robustness of the models. Furthermore,\nthere has been little to no consideration of the impact of the quality of\ndocuments' features on the trustworthiness of the final prediction. In this\npaper, we introduce a novel model-agnostic framework, called MAPX, which allows\nevidence based aggregation of predictions from existing models in an\nexplainable manner. Indeed, the developed aggregation method is adaptive,\ndynamic and considers the quality of OSMN document features. Further, we\nperform extensive experiments on benchmarked fake news datasets to demonstrate\nthe effectiveness of MAPX using various real-world data quality scenarios. Our\nempirical results show that the proposed framework consistently outperforms all\nstate-of-the-art models evaluated. For reproducibility, a demo of MAPX is\navailable at \\href{https://github.com/SCondran/MAPX_framework}{this link}\n","authors":["Sarah Condran","Michael Bewong","Selasi Kwashie","Md Zahidul Islam","Irfan Altas","Joshua Condran"],"pdf_url":"https://arxiv.org/pdf/2409.08522v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08521v1","updated":"2024-09-13T03:43:30Z","published":"2024-09-13T03:43:30Z","title":"Optimal Classification-based Anomaly Detection with Neural Networks:\n Theory and Practice","summary":" Anomaly detection is an important problem in many application areas, such as\nnetwork security. Many deep learning methods for unsupervised anomaly detection\nproduce good empirical performance but lack theoretical guarantees. 
By casting\nanomaly detection into a binary classification problem, we establish\nnon-asymptotic upper bounds and a convergence rate on the excess risk on\nrectified linear unit (ReLU) neural networks trained on synthetic anomalies.\nOur convergence rate on the excess risk matches the minimax optimal rate in the\nliterature. Furthermore, we provide lower and upper bounds on the number of\nsynthetic anomalies that can attain this optimality. For practical\nimplementation, we relax some conditions to improve the search for the\nempirical risk minimizer, which leads to competitive performance to other\nclassification-based methods for anomaly detection. Overall, our work provides\nthe first theoretical guarantees of unsupervised neural network-based anomaly\ndetectors and empirical insights on how to design them well.\n","authors":["Tian-Yi Zhou","Matthew Lau","Jizhou Chen","Wenke Lee","Xiaoming Huo"],"pdf_url":"https://arxiv.org/pdf/2409.08521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08518v1","updated":"2024-09-13T03:34:37Z","published":"2024-09-13T03:34:37Z","title":"Anytime Continual Learning for Open Vocabulary Classification","summary":" We propose an approach for anytime continual learning (AnytimeCL) for open\nvocabulary image classification. The AnytimeCL problem aims to break away from\nbatch training and rigid models by requiring that a system can predict any set\nof labels at any time and efficiently update and improve when receiving one or\nmore training samples at any time. Despite the challenging goal, we achieve\nsubstantial improvements over recent methods. We propose a dynamic weighting\nbetween predictions of a partially fine-tuned model and a fixed open vocabulary\nmodel that enables continual improvement when training samples are available\nfor a subset of a task's labels. We also propose an attention-weighted PCA\ncompression of training features that reduces storage and computation with\nlittle impact to model accuracy. Our methods are validated with experiments\nthat test flexibility of learning and inference. Code is available at\nhttps://github.com/jessemelpolio/AnytimeCL.\n","authors":["Zhen Zhu","Yiming Gong","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2409.08518v1.pdf","comment":"To appear at ECCV 2024 as Oral presentation"},{"id":"http://arxiv.org/abs/2407.12665v2","updated":"2024-09-13T03:07:37Z","published":"2024-07-17T15:48:39Z","title":"Patch-Level Training for Large Language Models","summary":" As Large Language Models (LLMs) achieve remarkable progress in language\nunderstanding and generation, their training efficiency has become a critical\nconcern. Traditionally, LLMs are trained to predict the next token in a\nsequence. Despite the success of token-level training, it suffers from\nconsiderable computational costs due to the need to process an extensive number\nof tokens. To mitigate this issue, this paper introduces patch-level training\nfor LLMs, which reduces the sequence length by compressing multiple tokens into\na single patch. During patch-level training, we feed the language model shorter\nsequences of patches and train it to predict the next patch, thereby processing\nthe majority of the training data at a significantly reduced computational\ncost. Following this, the model continues token-level training on the remaining\ntraining data to align with the inference mode. 
Experiments on a diverse range\nof models (370M-2.7B parameters) demonstrate that patch-level training can\nreduce overall computational costs to 0.5$\\times$, without compromising the\nmodel performance compared to token-level training. Source code:\n\\url{https://github.com/shaochenze/PatchTrain}.\n","authors":["Chenze Shao","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.12665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08503v1","updated":"2024-09-13T02:55:22Z","published":"2024-09-13T02:55:22Z","title":"Enhancing Privacy in ControlNet and Stable Diffusion via Split Learning","summary":" With the emerging trend of large generative models, ControlNet is introduced\nto enable users to fine-tune pre-trained models with their own data for various\nuse cases. A natural question arises: how can we train ControlNet models while\nensuring users' data privacy across distributed devices? Exploring different\ndistributed training schemes, we find conventional federated learning and split\nlearning unsuitable. Instead, we propose a new distributed learning structure\nthat eliminates the need for the server to send gradients back. Through a\ncomprehensive evaluation of existing threats, we discover that in the context\nof training ControlNet with split learning, most existing attacks are\nineffective, except for two mentioned in previous literature. To counter these\nthreats, we leverage the properties of diffusion models and design a new\ntimestep sampling policy during forward processes. We further propose a\nprivacy-preserving activation function and a method to prevent private text\nprompts from leaving clients, tailored for image generation with diffusion\nmodels. Our experimental results demonstrate that our algorithms and systems\ngreatly enhance the efficiency of distributed training for ControlNet while\nensuring users' data privacy without compromising image generation quality.\n","authors":["Dixi Yao"],"pdf_url":"https://arxiv.org/pdf/2409.08503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16029v2","updated":"2024-09-13T02:51:18Z","published":"2024-08-28T03:43:01Z","title":"Meta-Learn Unimodal Signals with Weak Supervision for Multimodal\n Sentiment Analysis","summary":" Multimodal sentiment analysis aims to effectively integrate information from\nvarious sources to infer sentiment, where in many cases there are no\nannotations for unimodal labels. Therefore, most works rely on multimodal\nlabels for training. However, there exists the noisy label problem for the\nlearning of unimodal signals as multimodal annotations are not always the ideal\nsubstitutes for the unimodal ones, failing to achieve finer optimization for\nindividual modalities. In this paper, we explore the learning of unimodal\nlabels under the weak supervision from the annotated multimodal labels.\nSpecifically, we propose a novel meta uni-label generation (MUG) framework to\naddress the above problem, which leverages the available multimodal labels to\nlearn the corresponding unimodal labels by the meta uni-label correction\nnetwork (MUCN). We first design a contrastive-based projection module to bridge\nthe gap between unimodal and multimodal representations, so as to use\nmultimodal annotations to guide the learning of MUCN. Afterwards, we propose\nunimodal and multimodal denoising tasks to train MUCN with explicit supervision\nvia a bi-level optimization strategy. 
We then jointly train unimodal and\nmultimodal learning tasks to extract discriminative unimodal features for\nmultimodal inference. Experimental results suggest that MUG outperforms\ncompetitive baselines and can learn accurate unimodal labels.\n","authors":["Sijie Mai","Yu Zhao","Ying Zeng","Jianhua Yao","Haifeng Hu"],"pdf_url":"https://arxiv.org/pdf/2408.16029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04104v3","updated":"2024-09-13T02:48:33Z","published":"2024-08-07T21:45:01Z","title":"Hardware-Assisted Virtualization of Neural Processing Units for Cloud\n Platforms","summary":" Cloud platforms today have been deploying hardware accelerators like neural\nprocessing units (NPUs) for powering machine learning (ML) inference services.\nTo maximize the resource utilization while ensuring reasonable quality of\nservice, a natural approach is to virtualize NPUs for efficient resource\nsharing for multi-tenant ML services. However, virtualizing NPUs for modern\ncloud platforms is not easy. This is not only due to the lack of system\nabstraction support for NPU hardware, but also due to the lack of architectural\nand ISA support for enabling fine-grained dynamic operator scheduling for\nvirtualized NPUs.\n We present Neu10, a holistic NPU virtualization framework. We investigate\nvirtualization techniques for NPUs across the entire software and hardware\nstack. Neu10 consists of (1) a flexible NPU abstraction called vNPU, which\nenables fine-grained virtualization of the heterogeneous compute units in a\nphysical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go\ncomputing model and flexible vNPU-to-pNPU mappings for improved resource\nutilization and cost-effectiveness; (3) an ISA extension of modern NPU\narchitecture for facilitating fine-grained tensor operator scheduling for\nmultiple vNPUs. We implement Neu10 based on a production-level NPU simulator.\nOur experiments show that Neu10 improves the throughput of ML inference\nservices by up to 1.4$\\times$ and reduces the tail latency by up to\n4.6$\\times$, while improving the NPU utilization by 1.2$\\times$ on average,\ncompared to state-of-the-art NPU sharing approaches.\n","authors":["Yuqi Xue","Yiqi Liu","Lifeng Nai","Jian Huang"],"pdf_url":"https://arxiv.org/pdf/2408.04104v3.pdf","comment":"Accepted to MICRO'24"},{"id":"http://arxiv.org/abs/2406.09694v2","updated":"2024-09-13T02:46:51Z","published":"2024-06-14T03:38:40Z","title":"An Efficient Approach to Regression Problems with Tensor Neural Networks","summary":" This paper introduces a tensor neural network (TNN) to address nonparametric\nregression problems, leveraging its distinct sub-network structure to\neffectively facilitate variable separation and enhance the approximation of\ncomplex, high-dimensional functions. The TNN demonstrates superior performance\ncompared to conventional Feed-Forward Networks (FFN) and Radial Basis Function\nNetworks (RBN) in terms of both approximation accuracy and generalization\ncapacity, even with a comparable number of parameters. A significant innovation\nin our approach is the integration of statistical regression and numerical\nintegration within the TNN framework. This allows for efficient computation of\nhigh-dimensional integrals associated with the regression function and provides\ndetailed insights into the underlying data structure. 
Furthermore, we employ\ngradient and Laplacian analysis on the regression outputs to identify key\ndimensions influencing the predictions, thereby guiding the design of\nsubsequent experiments. These advancements make TNN a powerful tool for\napplications requiring precise high-dimensional data analysis and predictive\nmodeling.\n","authors":["Yongxin Li","Yifan Wang","Zhongshuo Lin","Hehu Xie"],"pdf_url":"https://arxiv.org/pdf/2406.09694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01607v3","updated":"2024-09-13T02:42:45Z","published":"2024-05-02T04:53:42Z","title":"Wildfire Risk Prediction: A Review","summary":" Wildfires have significant impacts on global vegetation, wildlife, and\nhumans. They destroy plant communities and wildlife habitats and contribute to\nincreased emissions of carbon dioxide, nitrogen oxides, methane, and other\npollutants. The prediction of wildfires relies on various independent variables\ncombined with regression or machine learning methods. In this technical review,\nwe describe the options for independent variables, data processing techniques,\nmodels, independent variables collinearity and importance estimation methods,\nand model performance evaluation metrics. First, we divide the independent\nvariables into 4 aspects, including climate and meteorology conditions,\nsocio-economical factors, terrain and hydrological features, and wildfire\nhistorical records. Second, preprocessing methods are described for different\nmagnitudes, different spatial-temporal resolutions, and different formats of\ndata. Third, the collinearity and importance evaluation methods of independent\nvariables are also considered. Fourth, we discuss the application of\nstatistical models, traditional machine learning models, and deep learning\nmodels in wildfire risk prediction. In this subsection, compared with other\nreviews, this manuscript particularly discusses the evaluation metrics and\nrecent advancements in deep learning methods. Lastly, addressing the\nlimitations of current research, this paper emphasizes the need for more\neffective deep learning time series forecasting algorithms, the utilization of\nthree-dimensional data including ground and trunk fuel, extraction of more\naccurate historical fire point data, and improved model evaluation metrics.\n","authors":["Zhengsen Xu","Jonathan Li","Sibo Cheng","Xue Rui","Yu Zhao","Hongjie He","Linlin Xu"],"pdf_url":"https://arxiv.org/pdf/2405.01607v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10060v3","updated":"2024-09-13T02:33:22Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. It includes 1,000 images with human\nlabels and 50,000 images with automatically generated weak labels. This dataset\ncould serve as a foundation for the research community to develop advanced\nwrinkle detection algorithms. Second, we introduce a simple training strategy\nutilizing texture maps, applicable to various segmentation models, to detect\nwrinkles across the face. 
Our two-stage training strategy first pretrain models\non a large dataset with weak labels (N=50k), or masked texture maps generated\nthrough computer vision techniques, without human intervention. We then\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. The network takes as input a combination of RGB and\nmasked texture map of the image, comprising four channels, in finetuning. We\neffectively combine labels from multiple annotators to minimize subjectivity in\nmanual labeling. Our strategies demonstrate improved segmentation performance\nin facial wrinkle segmentation both quantitatively and visually compared to\nexisting pretraining methods. The dataset is available at\nhttps://github.com/labhai/ffhq-wrinkle-dataset.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08487v1","updated":"2024-09-13T02:23:55Z","published":"2024-09-13T02:23:55Z","title":"Sub-graph Based Diffusion Model for Link Prediction","summary":" Denoising Diffusion Probabilistic Models (DDPMs) represent a contemporary\nclass of generative models with exceptional qualities in both synthesis and\nmaximizing the data likelihood. These models work by traversing a forward\nMarkov Chain where data is perturbed, followed by a reverse process where a\nneural network learns to undo the perturbations and recover the original data.\nThere have been increasing efforts exploring the applications of DDPMs in the\ngraph domain. However, most of them have focused on the generative perspective.\nIn this paper, we aim to build a novel generative model for link prediction. In\nparticular, we treat link prediction between a pair of nodes as a conditional\nlikelihood estimation of its enclosing sub-graph. With a dedicated design to\ndecompose the likelihood estimation process via the Bayesian formula, we are\nable to separate the estimation of sub-graph structure and its node features.\nSuch designs allow our model to simultaneously enjoy the advantages of\ninductive learning and the strong generalization capability. Remarkably,\ncomprehensive experiments across various datasets validate that our proposed\nmethod presents numerous advantages: (1) transferability across datasets\nwithout retraining, (2) promising generalization on limited training data, and\n(3) robustness against graph adversarial attacks.\n","authors":["Hang Li","Wei Jin","Geri Skenderi","Harry Shomer","Wenzhuo Tang","Wenqi Fan","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2409.08487v1.pdf","comment":"17 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.08482v1","updated":"2024-09-13T02:13:26Z","published":"2024-09-13T02:13:26Z","title":"Risks When Sharing LoRA Fine-Tuned Diffusion Model Weights","summary":" With the emerging trend in generative models and convenient public access to\ndiffusion models pre-trained on large datasets, users can fine-tune these\nmodels to generate images of personal faces or items in new contexts described\nby natural language. Parameter efficient fine-tuning (PEFT) such as Low Rank\nAdaptation (LoRA) has become the most common way to save memory and computation\nusage on the user end during fine-tuning. However, a natural question is\nwhether the private images used for fine-tuning will be leaked to adversaries\nwhen sharing model weights. 
In this paper, we study the issue of privacy\nleakage of a fine-tuned diffusion model in a practical setting, where\nadversaries only have access to model weights, rather than prompts or images\nused for fine-tuning. We design and build a variational network autoencoder\nthat takes model weights as input and outputs the reconstruction of private\nimages. To improve the efficiency of training such an autoencoder, we propose a\ntraining paradigm with the help of timestep embedding. The results give a\nsurprising answer to this research question: an adversary can generate images\ncontaining the same identities as the private images. Furthermore, we\ndemonstrate that no existing defense method, including differential\nprivacy-based methods, can preserve the privacy of private data used for\nfine-tuning a diffusion model without compromising the utility of a fine-tuned\nmodel.\n","authors":["Dixi Yao"],"pdf_url":"https://arxiv.org/pdf/2409.08482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03175v2","updated":"2024-09-13T02:11:15Z","published":"2023-08-06T18:05:39Z","title":"Adapting Machine Learning Diagnostic Models to New Populations Using a\n Small Amount of Data: Results from Clinical Neuroscience","summary":" Machine learning (ML) has shown great promise for revolutionizing a number of\nareas, including healthcare. However, it is also facing a reproducibility\ncrisis, especially in medicine. ML models that are carefully constructed from\nand evaluated on a training set might not generalize well on data from\ndifferent patient populations or acquisition instrument settings and protocols.\nWe tackle this problem in the context of neuroimaging of Alzheimer's disease\n(AD), schizophrenia (SZ) and brain aging. We develop a weighted empirical risk\nminimization approach that optimally combines data from a source group, e.g.,\nsubjects are stratified by attributes such as sex, age group, race and clinical\ncohort to make predictions on a target group, e.g., other sex, age group, etc.\nusing a small fraction (10%) of data from the target group. We apply this\nmethod to multi-source data of 15,363 individuals from 20 neuroimaging studies\nto build ML models for diagnosis of AD and SZ, and estimation of brain age. We\nfound that this approach achieves substantially better accuracy than existing\ndomain adaptation techniques: it obtains area under curve greater than 0.95 for\nAD classification, area under curve greater than 0.7 for SZ classification and\nmean absolute error less than 5 years for brain age prediction on all target\ngroups, achieving robustness to variations of scanners, protocols, and\ndemographic or clinical characteristics. In some cases, it is even better than\ntraining on all data from the target group, because it leverages the diversity\nand size of a larger training set. We also demonstrate the utility of our\nmodels for prognostic tasks such as predicting disease progression in\nindividuals with mild cognitive impairment. 
Critically, our brain age\nprediction models lead to new clinical insights regarding correlations with\nneurophysiological tests.\n","authors":["Rongguang Wang","Guray Erus","Pratik Chaudhari","Christos Davatzikos"],"pdf_url":"https://arxiv.org/pdf/2308.03175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08477v1","updated":"2024-09-13T02:07:20Z","published":"2024-09-13T02:07:20Z","title":"Integrating Neural Operators with Diffusion Models Improves Spectral\n Representation in Turbulence Modeling","summary":" We integrate neural operators with diffusion models to address the spectral\nlimitations of neural operators in surrogate modeling of turbulent flows. While\nneural operators offer computational efficiency, they exhibit deficiencies in\ncapturing high-frequency flow dynamics, resulting in overly smooth\napproximations. To overcome this, we condition diffusion models on neural\noperators to enhance the resolution of turbulent structures. Our approach is\nvalidated for different neural operators on diverse datasets, including a high\nReynolds number jet flow simulation and experimental Schlieren velocimetry. The\nproposed method significantly improves the alignment of predicted energy\nspectra with true distributions compared to neural operators alone.\nAdditionally, proper orthogonal decomposition analysis demonstrates enhanced\nspectral fidelity in space-time. This work establishes a new paradigm for\ncombining generative models with neural operators to advance surrogate modeling\nof turbulent systems, and it can be used in other scientific applications that\ninvolve microstructure and high-frequency content. See our project page:\nvivekoommen.github.io/NO_DM\n","authors":["Vivek Oommen","Aniruddha Bora","Zhen Zhang","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2409.08477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08474v1","updated":"2024-09-13T02:00:16Z","published":"2024-09-13T02:00:16Z","title":"Rethinking Meta-Learning from a Learning Lens","summary":" Meta-learning has emerged as a powerful approach for leveraging knowledge\nfrom previous tasks to solve new tasks. The mainstream methods focus on\ntraining a well-generalized model initialization, which is then adapted to\ndifferent tasks with limited data and updates. However, it pushes the model\noverfitting on the training tasks. Previous methods mainly attributed this to\nthe lack of data and used augmentations to address this issue, but they were\nlimited by sufficient training and effective augmentation strategies. In this\nwork, we focus on the more fundamental ``learning to learn'' strategy of\nmeta-learning to explore what causes errors and how to eliminate these errors\nwithout changing the environment. Specifically, we first rethink the\nalgorithmic procedure of meta-learning from a ``learning'' lens. Through\ntheoretical and empirical analyses, we find that (i) this paradigm faces the\nrisk of both overfitting and underfitting and (ii) the model adapted to\ndifferent tasks promote each other where the effect is stronger if the tasks\nare more similar. Based on this insight, we propose using task relations to\ncalibrate the optimization process of meta-learning and propose a plug-and-play\nmethod called Task Relation Learner (TRLearner) to achieve this goal.\nSpecifically, it first obtains task relation matrices from the extracted\ntask-specific meta-data. Then, it uses the obtained matrices with\nrelation-aware consistency regularization to guide optimization. 
Extensive\ntheoretical and empirical analyses demonstrate the effectiveness of TRLearner.\n","authors":["Jingyao Wang","Wenwen Qiang","Jiangmeng Li","Lingyu Si","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.08474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08469v1","updated":"2024-09-13T01:49:19Z","published":"2024-09-13T01:49:19Z","title":"Improved Finite-Particle Convergence Rates for Stein Variational\n Gradient Descent","summary":" We provide finite-particle convergence rates for the Stein Variational\nGradient Descent (SVGD) algorithm in the Kernel Stein Discrepancy\n($\\mathsf{KSD}$) and Wasserstein-2 metrics. Our key insight is the observation\nthat the time derivative of the relative entropy between the joint density of\n$N$ particle locations and the $N$-fold product target measure, starting from a\nregular initial distribution, splits into a dominant `negative part'\nproportional to $N$ times the expected $\\mathsf{KSD}^2$ and a smaller `positive\npart'. This observation leads to $\\mathsf{KSD}$ rates of order $1/\\sqrt{N}$,\nproviding a near optimal double exponential improvement over the recent result\nby~\\cite{shi2024finite}. Under mild assumptions on the kernel and potential,\nthese bounds also grow linearly in the dimension $d$. By adding a bilinear\ncomponent to the kernel, the above approach is used to further obtain\nWasserstein-2 convergence. For the case of `bilinear + Mat\\'ern' kernels, we\nderive Wasserstein-2 rates that exhibit a curse-of-dimensionality similar to\nthe i.i.d. setting. We also obtain marginal convergence and long-time\npropagation of chaos results for the time-averaged particle laws.\n","authors":["Krishnakumar Balasubramanian","Sayan Banerjee","Promit Ghosal"],"pdf_url":"https://arxiv.org/pdf/2409.08469v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.08466v1","updated":"2024-09-13T01:40:20Z","published":"2024-09-13T01:40:20Z","title":"Explaining Datasets in Words: Statistical Models with Natural Language\n Parameters","summary":" To make sense of massive data, we often fit simplified models and then\ninterpret the parameters; for example, we cluster the text embeddings and then\ninterpret the mean parameters of each cluster. However, these parameters are\noften high-dimensional and hard to interpret. To make model parameters directly\ninterpretable, we introduce a family of statistical models -- including\nclustering, time series, and classification models -- parameterized by natural\nlanguage predicates. For example, a cluster of text about COVID could be\nparameterized by the predicate \"discusses COVID\". To learn these statistical\nmodels effectively, we develop a model-agnostic algorithm that optimizes\ncontinuous relaxations of predicate parameters with gradient descent and\ndiscretizes them by prompting language models (LMs). Finally, we apply our\nframework to a wide range of problems: taxonomizing user chat dialogues,\ncharacterizing how they evolve across time, finding categories where one\nlanguage model is better than the other, clustering math problems based on\nsubareas, and explaining visual features in memorable images. Our framework is\nhighly versatile, applicable to both textual and visual domains, can be easily\nsteered to focus on specific properties (e.g. subareas), and explains\nsophisticated concepts that classical methods (e.g. 
n-gram analysis) struggle\nto produce.\n","authors":["Ruiqi Zhong","Heng Wang","Dan Klein","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2409.08466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17495v2","updated":"2024-09-13T01:05:46Z","published":"2023-12-29T07:19:42Z","title":"Integrating Chemical Language and Molecular Graph in Multimodal Fused\n Deep Learning for Drug Property Prediction","summary":" Accurately predicting molecular properties is a challenging but essential\ntask in drug discovery. Recently, many mono-modal deep learning methods have\nbeen successfully applied to molecular property prediction. However, the\ninherent limitation of mono-modal learning arises from relying solely on one\nmodality of molecular representation, which restricts a comprehensive\nunderstanding of drug molecules and hampers their resilience against data\nnoise. To overcome the limitations, we construct multimodal deep learning\nmodels to cover different molecular representations. We convert drug molecules\ninto three molecular representations, SMILES-encoded vectors, ECFP\nfingerprints, and molecular graphs. To process the modal information,\nTransformer-Encoder, bi-directional gated recurrent units (BiGRU), and graph\nconvolutional network (GCN) are utilized for feature learning respectively,\nwhich can enhance the model capability to acquire complementary and naturally\noccurring bioinformatics information. We evaluated our triple-modal model on\nsix molecule datasets. Different from bi-modal learning models, we adopt five\nfusion methods to capture the specific features and leverage the contribution\nof each modal information better. Compared with mono-modal models, our\nmultimodal fused deep learning (MMFDL) models outperform single models in\naccuracy, reliability, and resistance capability against noise. Moreover, we\ndemonstrate its generalization ability in the prediction of binding constants\nfor protein-ligand complex molecules in the refined set of PDBbind. The\nadvantage of the multimodal model lies in its ability to process diverse\nsources of data using proper models and suitable fusion methods, which would\nenhance the noise resistance of the model while obtaining data diversity.\n","authors":["Xiaohua Lu","Liangxu Xie","Lei Xu","Rongzhi Mao","Shan Chang","Xiaojun Xu"],"pdf_url":"https://arxiv.org/pdf/2312.17495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01332v2","updated":"2024-09-13T00:56:01Z","published":"2024-03-29T22:49:43Z","title":"Explaining Large Language Models Decisions with Shapley Values","summary":" The emergence of large language models (LLMs) has opened up exciting\npossibilities for simulating human behavior and cognitive processes, with\npotential applications in various domains, including marketing research and\nconsumer behavior analysis. However, the validity of utilizing LLMs as\nstand-ins for human subjects remains uncertain due to glaring divergences that\nsuggest fundamentally different underlying processes at play and the\nsensitivity of LLM responses to prompt variations. This paper presents a novel\napproach based on Shapley values from cooperative game theory to interpret LLM\nbehavior and quantify the relative contribution of each prompt component to the\nmodel's output. 
Through two applications - a discrete choice experiment and an\ninvestigation of cognitive biases - we demonstrate how the Shapley value method\ncan uncover what we term \"token noise\" effects, a phenomenon where LLM\ndecisions are disproportionately influenced by tokens providing minimal\ninformative content. This phenomenon raises concerns about the robustness and\ngeneralizability of insights obtained from LLMs in the context of human\nbehavior simulation. Our model-agnostic approach extends its utility to\nproprietary LLMs, providing a valuable tool for practitioners and researchers\nto strategically optimize prompts and mitigate apparent cognitive biases. Our\nfindings underscore the need for a more nuanced understanding of the factors\ndriving LLM responses before relying on them as substitutes for human subjects\nin survey settings. We emphasize the importance of researchers reporting\nresults conditioned on specific prompt templates and exercising caution when\ndrawing parallels between human behavior and LLMs.\n","authors":["Behnam Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2404.01332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08439v1","updated":"2024-09-13T00:11:09Z","published":"2024-09-13T00:11:09Z","title":"Input-to-State Stable Coupled Oscillator Networks for Closed-form\n Model-based Control in Latent Space","summary":" Even though a variety of methods (e.g., RL, MPC, LQR) have been proposed in\nthe literature, efficient and effective latent-space control of physical\nsystems remains an open challenge. A promising avenue would be to leverage\npowerful and well-understood closed-form strategies from control theory\nliterature in combination with learned dynamics, such as potential-energy\nshaping. We identify three fundamental shortcomings in existing latent-space\nmodels that have so far prevented this powerful combination: (i) they lack the\nmathematical structure of a physical system, (ii) they do not inherently\nconserve the stability properties of the real systems. Furthermore, (iii) these\nmethods do not have an invertible mapping between input and latent-space\nforcing. This work proposes a novel Coupled Oscillator Network (CON) model that\nsimultaneously tackles all these issues. More specifically, (i) we show\nanalytically that CON is a Lagrangian system - i.e., it possesses well-defined\npotential and kinetic energy terms. Then, (ii) we provide formal proof of\nglobal Input-to-State stability using Lyapunov arguments. Moving to the\nexperimental side, (iii) we demonstrate that CON reaches SoA performance when\nlearning complex nonlinear dynamics of mechanical systems directly from images.\nAn additional methodological innovation contributing to achieving this third\ngoal is an approximated closed-form solution for efficient integration of\nnetwork dynamics, which eases efficient training. We tackle (iv) by\napproximating the forcing-to-input mapping with a decoder that is trained to\nreconstruct the input based on the encoded latent space force. 
Finally, we\nleverage these four properties and show that they enable latent-space control.\nWe use an integral-saturated PID with potential force compensation and\ndemonstrate high-quality performance on a soft robot using raw pixels as the\nonly feedback information.\n","authors":["Maximilian Stölzle","Cosimo Della Santina"],"pdf_url":"https://arxiv.org/pdf/2409.08439v1.pdf","comment":"41 pages, currently under review"},{"id":"http://arxiv.org/abs/2409.08434v1","updated":"2024-09-13T00:01:58Z","published":"2024-09-13T00:01:58Z","title":"Predictive Control and Regret Analysis of Non-Stationary MDP with\n Look-ahead Information","summary":" Policy design in non-stationary Markov Decision Processes (MDPs) is\ninherently challenging due to the complexities introduced by time-varying\nsystem transition and reward, which make it difficult for learners to determine\nthe optimal actions for maximizing cumulative future rewards. Fortunately, in\nmany practical applications, such as energy systems, look-ahead predictions are\navailable, including forecasts for renewable energy generation and demand. In\nthis paper, we leverage these look-ahead predictions and propose an algorithm\ndesigned to achieve low regret in non-stationary MDPs by incorporating such\npredictions. Our theoretical analysis demonstrates that, under certain\nassumptions, the regret decreases exponentially as the look-ahead window\nexpands. When the system prediction is subject to error, the regret does not\nexplode even if the prediction error grows sub-exponentially as a function of\nthe prediction horizon. We validate our approach through simulations,\nconfirming the efficacy of our algorithm in non-stationary environments.\n","authors":["Ziyi Zhang","Yorie Nakahira","Guannan Qu"],"pdf_url":"https://arxiv.org/pdf/2409.08434v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.08795v1","updated":"2024-09-13T12:59:39Z","published":"2024-09-13T12:59:39Z","title":"LLaQo: Towards a Query-Based Coach in Expressive Music Performance\n Assessment","summary":" Research in music understanding has extensively explored composition-level\nattributes such as key, genre, and instrumentation through advanced\nrepresentations, leading to cross-modal applications using large language\nmodels. However, aspects of musical performance such as stylistic expression\nand technique remain underexplored, along with the potential of using large\nlanguage models to enhance educational outcomes with customized feedback. To\nbridge this gap, we introduce LLaQo, a Large Language Query-based music coach\nthat leverages audio language modeling to provide detailed and formative\nassessments of music performances. We also introduce instruction-tuned\nquery-response datasets that cover a variety of performance dimensions from\npitch accuracy to articulation, as well as contextual performance understanding\n(such as difficulty and performance techniques). Utilizing AudioMAE encoder and\nVicuna-7b LLM backend, our model achieved state-of-the-art (SOTA) results in\npredicting teachers' performance ratings, as well as in identifying piece\ndifficulty and playing techniques. Textual responses from LLaQo was moreover\nrated significantly higher compared to other baseline models in a user study\nusing audio-text matching. 
Our proposed model can thus provide informative\nanswers to open-ended questions related to musical performance from audio data.\n","authors":["Huan Zhang","Vincent Cheung","Hayato Nishioka","Simon Dixon","Shinichi Furuya"],"pdf_url":"https://arxiv.org/pdf/2409.08795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08772v1","updated":"2024-09-13T12:30:15Z","published":"2024-09-13T12:30:15Z","title":"On the Computation of BD-Rate over a Set of Videos for Fair Assessment\n of Performance of Learned Video Codecs","summary":" The Bj{\\o}ntegaard Delta (BD) measure is widely employed to evaluate and\nquantify the variations in the rate-distortion(RD) performance across different\ncodecs. Many researchers report the average BD value over multiple videos\nwithin a dataset for different codecs. We claim that the current practice in\nthe learned video compression community of computing the average BD value over\na dataset based on the average RD curve of multiple videos can lead to\nmisleading conclusions. We show both by analysis of a simplistic case of linear\nRD curves and experimental results with two recent learned video codecs that\naveraging RD curves can lead to a single video to disproportionately influence\nthe average BD value especially when the operating bitrate range of different\ncodecs do not exactly match. Instead, we advocate for calculating the BD\nmeasure per-video basis, as commonly done by the traditional video compression\ncommunity, followed by averaging the individual BD values over videos, to\nprovide a fair comparison of learned video codecs. Our experimental results\ndemonstrate that the comparison of two recent learned video codecs is affected\nby how we evaluate the average BD measure.\n","authors":["M. Akin Yilmaz","Onur Keleş","A. Murat Tekalp"],"pdf_url":"https://arxiv.org/pdf/2409.08772v1.pdf","comment":"Submitted to IEEE ICASSP 2025"},{"id":"http://arxiv.org/abs/2309.08751v2","updated":"2024-09-13T12:06:13Z","published":"2023-09-15T20:27:47Z","title":"Diverse Neural Audio Embeddings -- Bringing Features back !","summary":" With the advent of modern AI architectures, a shift has happened towards\nend-to-end architectures. This pivot has led to neural architectures being\ntrained without domain-specific biases/knowledge, optimized according to the\ntask. We in this paper, learn audio embeddings via diverse feature\nrepresentations, in this case, domain-specific. For the case of audio\nclassification over hundreds of categories of sound, we learn robust separate\nembeddings for diverse audio properties such as pitch, timbre, and neural\nrepresentation, along with also learning it via an end-to-end architecture. 
We\nobserve handcrafted embeddings, e.g., pitch and timbre-based, although on their\nown, are not able to beat a fully end-to-end representation, yet adding these\ntogether with end-to-end embedding helps us, significantly improve performance.\nThis work would pave the way to bring some domain expertise with end-to-end\nmodels to learn robust, diverse representations, surpassing the performance of\njust training end-to-end models.\n","authors":["Prateek Verma"],"pdf_url":"https://arxiv.org/pdf/2309.08751v2.pdf","comment":"6 pages, 1 figure, 2 table, Under Review for 50th IEEE ICASSP 2025,\n Hyderabad, India"},{"id":"http://arxiv.org/abs/2409.08628v1","updated":"2024-09-13T08:33:03Z","published":"2024-09-13T08:33:03Z","title":"Rhythmic Foley: A Framework For Seamless Audio-Visual Alignment In\n Video-to-Audio Synthesis","summary":" Our research introduces an innovative framework for video-to-audio synthesis,\nwhich solves the problems of audio-video desynchronization and semantic loss in\nthe audio. By incorporating a semantic alignment adapter and a temporal\nsynchronization adapter, our method significantly improves semantic integrity\nand the precision of beat point synchronization, particularly in fast-paced\naction sequences. Utilizing a contrastive audio-visual pre-trained encoder, our\nmodel is trained with video and high-quality audio data, improving the quality\nof the generated audio. This dual-adapter approach empowers users with enhanced\ncontrol over audio semantics and beat effects, allowing the adjustment of the\ncontroller to achieve better results. Extensive experiments substantiate the\neffectiveness of our framework in achieving seamless audio-visual alignment.\n","authors":["Zhiqi Huang","Dan Luo","Jun Wang","Huan Liao","Zhiheng Li","Zhiyong Wu"],"pdf_url":"https://arxiv.org/pdf/2409.08628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07728v3","updated":"2024-09-13T07:41:02Z","published":"2024-07-10T15:00:08Z","title":"SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature\n Disentanglement and Enhancement","summary":" Singing voice conversion (SVC) aims to convert a singer's voice to another\nsinger's from a reference audio while keeping the original semantics. However,\nexisting SVC methods can hardly perform zero-shot due to incomplete feature\ndisentanglement or dependence on the speaker look-up table. We propose the\nfirst open-source high-quality zero-shot SVC model SaMoye that can convert\nsinging to human and non-human timbre. SaMoye disentangles the singing voice's\nfeatures into content, timbre, and pitch features, where we combine multiple\nASR models and compress the content features to reduce timbre leaks. Besides,\nwe enhance the timbre features by unfreezing the speaker encoder and mixing the\nspeaker embedding with top-3 similar speakers. We also establish an\nunparalleled large-scale dataset to guarantee zero-shot performance, which\ncomprises more than 1,815 hours of pure singing voice and 6,367 speakers. We\nconduct objective and subjective experiments to find that SaMoye outperforms\nother models in zero-shot SVC tasks even under extreme conditions like\nconverting singing to animals' timbre. 
The code and weight of SaMoye are\navailable on https://github.com/CarlWangChina/SaMoye-SVC.\n","authors":["Zihao Wang","Le Ma","Yongsheng Feng","Xin Pan","Yuhang Jin","Kejun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.07728v3.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.08601v1","updated":"2024-09-13T07:31:44Z","published":"2024-09-13T07:31:44Z","title":"STA-V2A: Video-to-Audio Generation with Semantic and Temporal Alignment","summary":" Visual and auditory perception are two crucial ways humans experience the\nworld. Text-to-video generation has made remarkable progress over the past\nyear, but the absence of harmonious audio in generated video limits its broader\napplications. In this paper, we propose Semantic and Temporal Aligned\nVideo-to-Audio (STA-V2A), an approach that enhances audio generation from\nvideos by extracting both local temporal and global semantic video features and\ncombining these refined video features with text as cross-modal guidance. To\naddress the issue of information redundancy in videos, we propose an onset\nprediction pretext task for local temporal feature extraction and an attentive\npooling module for global semantic feature extraction. To supplement the\ninsufficient semantic information in videos, we propose a Latent Diffusion\nModel with Text-to-Audio priors initialization and cross-modal guidance. We\nalso introduce Audio-Audio Align, a new metric to assess audio-temporal\nalignment. Subjective and objective metrics demonstrate that our method\nsurpasses existing Video-to-Audio models in generating audio with better\nquality, semantic consistency, and temporal alignment. The ablation experiment\nvalidated the effectiveness of each module. Audio samples are available at\nhttps://y-ren16.github.io/STAV2A.\n","authors":["Yong Ren","Chenxing Li","Manjie Xu","Wei Liang","Yu Gu","Rilin Chen","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2409.08601v1.pdf","comment":"Submitted to ICASSP2025"},{"id":"http://arxiv.org/abs/2211.10881v3","updated":"2024-09-13T07:06:19Z","published":"2022-11-20T06:31:23Z","title":"Deepfake Detection: A Comprehensive Survey from the Reliability\n Perspective","summary":" The mushroomed Deepfake synthetic materials circulated on the internet have\nraised a profound social impact on politicians, celebrities, and individuals\nworldwide. In this survey, we provide a thorough review of the existing\nDeepfake detection studies from the reliability perspective. We identify three\nreliability-oriented research challenges in the current Deepfake detection\ndomain: transferability, interpretability, and robustness. Moreover, while\nsolutions have been frequently addressed regarding the three challenges, the\ngeneral reliability of a detection model has been barely considered, leading to\nthe lack of reliable evidence in real-life usages and even for prosecutions on\nDeepfake-related cases in court. We, therefore, introduce a model reliability\nstudy metric using statistical random sampling knowledge and the publicly\navailable benchmark datasets to review the reliability of the existing\ndetection models on arbitrary Deepfake candidate suspects. Case studies are\nfurther executed to justify the real-life Deepfake cases including different\ngroups of victims with the help of the reliably qualified detection models as\nreviewed in this survey. 
Reviews and experiments on the existing approaches\nprovide informative discussions and future research directions for Deepfake\ndetection.\n","authors":["Tianyi Wang","Xin Liao","Kam Pui Chow","Xiaodong Lin","Yinglong Wang"],"pdf_url":"https://arxiv.org/pdf/2211.10881v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08489v1","updated":"2024-09-13T02:32:10Z","published":"2024-09-13T02:32:10Z","title":"Confidence Calibration for Audio Captioning Models","summary":" Systems that automatically generate text captions for audio, images and video\nlack a confidence indicator of the relevance and correctness of the generated\nsequences. To address this, we build on existing methods of confidence\nmeasurement for text by introducing selective pooling of token probabilities,\nwhich aligns better with traditional correctness measures than conventional\npooling does. Further, we propose directly measuring the similarity between\ninput audio and text in a shared embedding space. To measure self-consistency,\nwe adapt semantic entropy for audio captioning, and find that these two methods\nalign even better than pooling-based metrics with the correctness measure that\ncalculates acoustic similarity between captions. Finally, we explain why\ntemperature scaling of confidences improves calibration.\n","authors":["Rehana Mahfuz","Yinyi Guo","Erik Visser"],"pdf_url":"https://arxiv.org/pdf/2409.08489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08353v2","updated":"2024-09-13T01:48:15Z","published":"2024-06-12T15:59:25Z","title":"Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study\n on Word Error Rate and Fusion Techniques","summary":" Text data is commonly utilized as a primary input to enhance Speech Emotion\nRecognition (SER) performance and reliability. However, the reliance on\nhuman-transcribed text in most studies impedes the development of practical SER\nsystems, creating a gap between in-lab research and real-world scenarios where\nAutomatic Speech Recognition (ASR) serves as the text source. Hence, this study\nbenchmarks SER performance using ASR transcripts with varying Word Error Rates\n(WERs) from eleven models on three well-known corpora: IEMOCAP, CMU-MOSI, and\nMSP-Podcast. Our evaluation includes both text-only and bimodal SER with six\nfusion techniques, aiming for a comprehensive analysis that uncovers novel\nfindings and challenges faced by current SER research. Additionally, we propose\na unified ASR error-robust framework integrating ASR error correction and\nmodality-gated fusion, achieving lower WER and higher SER results compared to\nthe best-performing ASR transcript. These findings provide insights into SER\nwith ASR assistance, especially for real-world applications.\n","authors":["Yuanchao Li","Peter Bell","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2406.08353v2.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2401.17800v2","updated":"2024-09-13T01:43:44Z","published":"2024-01-31T12:51:26Z","title":"Dance-to-Music Generation with Encoder-based Textual Inversion","summary":" The seamless integration of music with dance movements is essential for\ncommunicating the artistic intent of a dance piece. This alignment also\nsignificantly improves the immersive quality of gaming experiences and\nanimation productions. 
Although there has been remarkable advancement in\ncreating high-fidelity music from textual descriptions, current methodologies\nmainly focus on modulating overall characteristics such as genre and emotional\ntone. They often overlook the nuanced management of temporal rhythm, which is\nindispensable in crafting music for dance, since it intricately aligns the\nmusical beats with the dancers' movements. Recognizing this gap, we propose an\nencoder-based textual inversion technique to augment text-to-music models with\nvisual control, facilitating personalized music generation. Specifically, we\ndevelop dual-path rhythm-genre inversion to effectively integrate the rhythm\nand genre of a dance motion sequence into the textual space of a text-to-music\nmodel. Contrary to traditional textual inversion methods, which directly update\ntext embeddings to reconstruct a single target object, our approach utilizes\nseparate rhythm and genre encoders to obtain text embeddings for two\npseudo-words, adapting to the varying rhythms and genres. We collect a new\ndataset called In-the-wild Dance Videos (InDV) and demonstrate that our\napproach outperforms state-of-the-art methods across multiple evaluation\nmetrics. Furthermore, our method is able to adapt to changes in tempo and\neffectively integrates with the inherent text-guided generation capability of\nthe pre-trained model. Our source code and demo videos are available at\n\\url{https://github.com/lsfhuihuiff/Dance-to-music_Siggraph_Asia_2024}\n","authors":["Sifei Li","Weiming Dong","Yuxin Zhang","Fan Tang","Chongyang Ma","Oliver Deussen","Tong-Yee Lee","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.17800v2.pdf","comment":"11 pages, 5 figures, SIGGRAPH ASIA 2024"}]},"2024-09-12T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2403.11793v2","updated":"2024-09-12T23:08:08Z","published":"2024-03-18T13:50:50Z","title":"Reasoning Abilities of Large Language Models: In-Depth Analysis on the\n Abstraction and Reasoning Corpus","summary":" The existing methods for evaluating the inference abilities of Large Language\nModels (LLMs) have been results-centric, making it difficult to assess the\ninference process. We introduce a new approach using the Abstraction and\nReasoning Corpus (ARC) dataset to evaluate the inference and contextual\nunderstanding abilities of large language models in a process-centric manner.\nARC demands rigorous logical structures for problem-solving, making it a\nbenchmark that facilitates the comparison of model inference abilities with\nhumans. Experimental results confirm that while large language models possess\nweak inference abilities, they still lag in terms of logical coherence,\ncompositionality, and productivity. Our experiments highlight the reasoning\ncapabilities of LLMs, proposing development paths for achieving human-level\nreasoning.\n","authors":["Seungpil Lee","Woochang Sim","Donghyeon Shin","Wongyu Seo","Jiwon Park","Seokki Lee","Sanha Hwang","Sejin Kim","Sundong Kim"],"pdf_url":"https://arxiv.org/pdf/2403.11793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02764v2","updated":"2024-09-12T22:18:03Z","published":"2024-05-04T22:00:28Z","title":"Assessing Adversarial Robustness of Large Language Models: An Empirical\n Study","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nbut their robustness against adversarial attacks remains a critical concern. 
We\npresent a novel white-box style attack approach that exposes vulnerabilities\nin leading open-source LLMs, including Llama, OPT, and T5. We assess the impact\nof model size, structure, and fine-tuning strategies on their resistance to\nadversarial perturbations. Our comprehensive evaluation across five diverse\ntext classification tasks establishes a new benchmark for LLM robustness. The\nfindings of this study have far-reaching implications for the reliable\ndeployment of LLMs in real-world applications and contribute to the advancement\nof trustworthy AI systems.\n","authors":["Zeyu Yang","Zhao Meng","Xiaochen Zheng","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2405.02764v2.pdf","comment":"Oral presentation at KDD 2024 GenAI Evaluation workshop"},{"id":"http://arxiv.org/abs/2310.00898v4","updated":"2024-09-12T22:09:25Z","published":"2023-10-02T04:29:40Z","title":"Enabling Language Models to Implicitly Learn Self-Improvement","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nopen-ended text generation tasks. However, the inherent open-ended nature of\nthese tasks implies that there is always room for improvement in the quality of\nmodel responses. To address this challenge, various approaches have been\nproposed to enhance the performance of LLMs. There has been a growing focus on\nenabling LLMs to self-improve their response quality, thereby reducing the\nreliance on extensive human annotation efforts for collecting diverse and\nhigh-quality training data. Recently, prompting-based methods have been widely\nexplored among self-improvement methods owing to their effectiveness,\nefficiency, and convenience. However, those methods usually require explicitly\nand thoroughly written rubrics as inputs to LLMs. It is expensive and\nchallenging to manually derive and provide all necessary rubrics with a\nreal-world complex goal for improvement (e.g., being more helpful and less\nharmful). To this end, we propose an ImPlicit Self-ImprovemenT (PIT) framework\nthat implicitly learns the improvement goal from human preference data. PIT\nonly requires preference data that are used to train reward models without\nextra human efforts. Specifically, we reformulate the training objective of\nreinforcement learning from human feedback (RLHF) -- instead of maximizing\nresponse quality for a given input, we maximize the quality gap of the response\nconditioned on a reference response. In this way, PIT is implicitly trained\nwith the improvement goal of better aligning with human preferences.\nExperiments on two real-world datasets and one synthetic dataset show that our\nmethod significantly outperforms prompting-based methods.\n","authors":["Ziqi Wang","Le Hou","Tianjian Lu","Yuexin Wu","Yunxuan Li","Hongkun Yu","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2310.00898v4.pdf","comment":"Accepted at ICLR 2024. 28 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2303.15987v3","updated":"2024-09-12T21:44:01Z","published":"2023-03-28T14:02:42Z","title":"Sentiment Analysis Dataset in Moroccan Dialect: Bridging the Gap Between\n Arabic and Latin Scripted dialect","summary":" Sentiment analysis, the automated process of determining emotions or opinions\nexpressed in text, has seen extensive exploration in the field of natural\nlanguage processing. However, one aspect that has remained underrepresented is\nthe sentiment analysis of the Moroccan dialect, which boasts a unique\nlinguistic landscape and the coexistence of multiple scripts. 
Previous works in\nsentiment analysis primarily targeted dialects employing Arabic script. While\nthese efforts provided valuable insights, they may not fully capture the\ncomplexity of Moroccan web content, which features a blend of Arabic and Latin\nscript. As a result, our study emphasizes the importance of extending sentiment\nanalysis to encompass the entire spectrum of Moroccan linguistic diversity.\nCentral to our research is the creation of the largest public dataset for\nMoroccan dialect sentiment analysis that incorporates not only Moroccan dialect\nwritten in Arabic script but also in Latin letters. By assembling a diverse\nrange of textual data, we were able to construct a dataset of 20,000\nmanually labeled texts in Moroccan dialect and also publicly available lists\nof stop words in Moroccan dialect. To dive into sentiment analysis, we\nconducted a comparative study on multiple machine learning models to assess\ntheir compatibility with our dataset. Experiments were performed using both raw\nand preprocessed data to show the importance of the preprocessing step. We were\nable to achieve 92% accuracy in our model and to further prove its reliability, we\ntested our model on smaller publicly available datasets of Moroccan dialect and\nthe results were favorable.\n","authors":["Mouad Jbel","Mourad Jabrane","Imad Hafidi","Abdulmutallib Metrane"],"pdf_url":"https://arxiv.org/pdf/2303.15987v3.pdf","comment":"Lang Resources & Evaluation (2024)"},{"id":"http://arxiv.org/abs/2409.08406v1","updated":"2024-09-12T21:39:01Z","published":"2024-09-12T21:39:01Z","title":"Knowledge Tagging with Large Language Model based Multi-Agent System","summary":" Knowledge tagging for questions is vital in modern intelligent educational\napplications, including learning progress diagnosis, practice question\nrecommendations, and course content organization. Traditionally, these\nannotations have been performed by pedagogical experts, as the task demands not\nonly a deep semantic understanding of question stems and knowledge definitions\nbut also a strong ability to link problem-solving logic with relevant knowledge\nconcepts. With the advent of advanced natural language processing (NLP)\nalgorithms, such as pre-trained language models and large language models\n(LLMs), pioneering studies have explored automating the knowledge tagging\nprocess using various machine learning models. In this paper, we investigate\nthe use of a multi-agent system to address the limitations of previous\nalgorithms, particularly in handling complex cases involving intricate\nknowledge definitions and strict numerical constraints. By demonstrating its\nsuperior performance on the publicly available math question knowledge tagging\ndataset, MathKnowCT, we highlight the significant potential of an LLM-based\nmulti-agent system in overcoming the challenges that previous methods have\nencountered. 
Finally, through an in-depth discussion of the implications of\nautomating knowledge tagging, we underscore the promising results of deploying\nLLM-based algorithms in educational contexts.\n","authors":["Hang Li","Tianlong Xu","Ethan Chang","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2409.08406v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.08386v1","updated":"2024-09-12T20:32:07Z","published":"2024-09-12T20:32:07Z","title":"Self-Supervised Inference of Agents in Trustless Environments","summary":" In this paper, we propose a novel approach where agents can form swarms to\nproduce high-quality responses effectively. This is accomplished by utilizing\nagents capable of data inference and ranking, which can be effectively\nimplemented using LLMs as response classifiers. We assess existing approaches\nfor trustless agent inference, define our methodology, estimate practical\nparameters, and model various types of malicious agent attacks. Our method\nleverages the collective intelligence of swarms, ensuring robust and efficient\ndecentralized AI inference with better accuracy, security, and reliability. We\nshow that our approach is an order of magnitude faster than other trustless\ninference strategies reaching less than 125 ms validation latency.\n","authors":["Vladyslav Larin","Ivan Nikitin","Alexander Firsov"],"pdf_url":"https://arxiv.org/pdf/2409.08386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08381v1","updated":"2024-09-12T20:02:51Z","published":"2024-09-12T20:02:51Z","title":"Rethinking Prompting Strategies for Multi-Label Recognition with Partial\n Annotations","summary":" Vision-language models (VLMs) like CLIP have been adapted for Multi-Label\nRecognition (MLR) with partial annotations by leveraging prompt-learning, where\npositive and negative prompts are learned for each class to associate their\nembeddings with class presence or absence in the shared vision-text feature\nspace. While this approach improves MLR performance by relying on VLM priors,\nwe hypothesize that learning negative prompts may be suboptimal, as the\ndatasets used to train VLMs lack image-caption pairs explicitly focusing on\nclass absence. To analyze the impact of positive and negative prompt learning\non MLR, we introduce PositiveCoOp and NegativeCoOp, where only one prompt is\nlearned with VLM guidance while the other is replaced by an embedding vector\nlearned directly in the shared feature space without relying on the text\nencoder. Through empirical analysis, we observe that negative prompts degrade\nMLR performance, and learning only positive prompts, combined with learned\nnegative embeddings (PositiveCoOp), outperforms dual prompt learning\napproaches. Moreover, we quantify the performance benefits that prompt-learning\noffers over a simple vision-features-only baseline, observing that the baseline\ndisplays strong performance comparable to dual prompt learning approach\n(DualCoOp), when the proportion of missing labels is low, while requiring half\nthe training compute and 16 times fewer parameters\n","authors":["Samyak Rawlekar","Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2409.08381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04037v2","updated":"2024-09-12T19:54:37Z","published":"2022-12-08T02:21:47Z","title":"Demystifying Prompts in Language Models via Perplexity Estimation","summary":" Language models can be prompted to perform a wide variety of zero- and\nfew-shot learning problems. 
However, performance varies significantly with the\nchoice of prompt, and we do not yet understand why this happens or how to pick\nthe best prompts. In this work, we analyze the factors that contribute to this\nvariance and establish a new empirical hypothesis: the performance of a prompt\nis coupled with the extent to which the model is familiar with the language it\ncontains. Over a wide range of tasks, we show that the lower the perplexity of\nthe prompt is, the better the prompt is able to perform the task. As a result,\nwe devise a method for creating prompts: (1) automatically extend a small seed\nset of manually written prompts by paraphrasing using GPT3 and backtranslation\nand (2) choose the lowest perplexity prompts to get significant gains in\nperformance.\n","authors":["Hila Gonen","Srini Iyer","Terra Blevins","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2212.04037v2.pdf","comment":"Published in Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2408.04667v2","updated":"2024-09-12T19:15:17Z","published":"2024-08-06T16:43:35Z","title":"LLM Stability: A detailed analysis with some surprises","summary":" LLM (large language model) practitioners commonly notice that outputs can\nvary for the same inputs, but we have been unable to find work that evaluates\nLLM stability as the main objective. In our study of 6 deterministically\nconfigured LLMs across 8 common tasks with 5 identical runs, we see accuracy\nvariations up to 10\\%. In addition, no LLM consistently delivers repeatable\naccuracy across all tasks. We also show examples of variation that are not\nnormally distributed and compare configurations with zero-shot/few-shot\nprompting and fine-tuned examples. To better quantify what is going on, we\nintroduce metrics focused on stability: TARr@N for the total agreement rate at\nN runs over raw output, and TARa@N for total agreement over parsed-out answers.\nWe suggest that stability metrics be integrated into leader boards and research\nresults going forward.\n","authors":["Berk Atil","Alexa Chittams","Liseng Fu","Ferhan Ture","Lixinyu Xu","Breck Baldwin"],"pdf_url":"https://arxiv.org/pdf/2408.04667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06518v2","updated":"2024-09-12T18:33:33Z","published":"2024-08-12T22:30:55Z","title":"Does Liking Yellow Imply Driving a School Bus? Semantic Leakage in\n Language Models","summary":" Despite their wide adoption, the biases and unintended behaviors of language\nmodels remain poorly understood. In this paper, we identify and characterize a\nphenomenon never discussed before, which we call semantic leakage, where models\nleak irrelevant information from the prompt into the generation in unexpected\nways. We propose an evaluation setting to detect semantic leakage both by\nhumans and automatically, curate a diverse test suite for diagnosing this\nbehavior, and measure significant semantic leakage in 13 flagship models. We\nalso show that models exhibit semantic leakage in languages besides English and\nacross different settings and generation scenarios. This discovery highlights\nyet another type of bias in language models that affects their generation\npatterns and behavior.\n","authors":["Hila Gonen","Terra Blevins","Alisa Liu","Luke Zettlemoyer","Noah A. 
Smith"],"pdf_url":"https://arxiv.org/pdf/2408.06518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18659v3","updated":"2024-09-12T18:30:26Z","published":"2024-02-28T19:09:08Z","title":"Large Language Models and Games: A Survey and Roadmap","summary":" Recent years have seen an explosive increase in research on large language\nmodels (LLMs), and accompanying public engagement on the topic. While starting\nas a niche area within natural language processing, LLMs have shown remarkable\npotential across a broad range of applications and domains, including games.\nThis paper surveys the current state of the art across the various applications\nof LLMs in and for games, and identifies the different roles LLMs can take\nwithin a game. Importantly, we discuss underexplored areas and promising\ndirections for future uses of LLMs in games and we reconcile the potential and\nlimitations of LLMs within the games domain. As the first comprehensive survey\nand roadmap at the intersection of LLMs and games, we are hopeful that this\npaper will serve as the basis for groundbreaking research and innovation in\nthis exciting new field.\n","authors":["Roberto Gallotta","Graham Todd","Marvin Zammit","Sam Earle","Antonios Liapis","Julian Togelius","Georgios N. Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2402.18659v3.pdf","comment":"Accepted for publication at the IEEE Transactions on Games (18 pages,\n 6 figures)"},{"id":"http://arxiv.org/abs/2403.10704v2","updated":"2024-09-12T18:25:16Z","published":"2024-03-15T21:43:46Z","title":"Parameter Efficient Reinforcement Learning from Human Feedback","summary":" While Reinforcement Learning from Human Feedback (RLHF) effectively aligns\npretrained Large Language and Vision-Language Models (LLMs, and VLMs) with\nhuman preferences, its computational cost and complexity hamper its wider\nadoption. To alleviate some of the computational burden of fine-tuning,\nparameter efficient methods, like LoRA were introduced. In this work, we\nempirically evaluate the setup of Parameter Efficient Reinforcement Learning\nfrom Human Feedback (PE-RLHF) that leverages LoRA fine-tuning for Reward\nModeling, and Reinforcement Learning. We benchmark the PE-RLHF setup on six\ndiverse datasets spanning summarization, harmless/helpful response generation,\nUI automation, and visual question answering in terms of effectiveness of the\ntrained models, and the training resources required. Our findings show, for the\nfirst time, that PE-RLHF achieves comparable performance to RLHF, while\nsignificantly reducing training time (up to 90% faster for reward models, and\n30% faster for RL), and memory footprint (up to 50% reduction for reward\nmodels, and 27% for RL). We provide comprehensive ablations across LoRA ranks,\nand model sizes for both reward modeling and reinforcement learning. 
By\nmitigating the computational burden associated with RLHF, we push for a broader\nadoption of PE-RLHF as an alignment technique for LLMs and VLMs.\n","authors":["Hakim Sidahmed","Samrat Phatale","Alex Hutcheson","Zhuonan Lin","Zhang Chen","Zac Yu","Jarvis Jin","Simral Chaudhary","Roman Komarytsia","Christiane Ahlheim","Yonghao Zhu","Bowen Li","Saravanan Ganesh","Bill Byrne","Jessica Hoffmann","Hassan Mansoor","Wei Li","Abhinav Rastogi","Lucas Dixon"],"pdf_url":"https://arxiv.org/pdf/2403.10704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08346v1","updated":"2024-09-12T18:18:22Z","published":"2024-09-12T18:18:22Z","title":"Towards Quantifying and Reducing Language Mismatch Effects in\n Cross-Lingual Speech Anti-Spoofing","summary":" The effects of language mismatch impact speech anti-spoofing systems, while\ninvestigations and quantification of these effects remain limited. Existing\nanti-spoofing datasets are mainly in English, and the high cost of acquiring\nmultilingual datasets hinders training language-independent models. We initiate\nthis work by evaluating top-performing speech anti-spoofing systems that are\ntrained on English data but tested on other languages, observing notable\nperformance declines. We propose an innovative approach - Accent-based data\nexpansion via TTS (ACCENT), which introduces diverse linguistic knowledge to\nmonolingual-trained models, improving their cross-lingual capabilities. We\nconduct experiments on a large-scale dataset consisting of over 3 million\nsamples, including 1.8 million training samples and nearly 1.2 million testing\nsamples across 12 languages. The language mismatch effects are preliminarily\nquantified and remarkably reduced over 15% by applying the proposed ACCENT.\nThis easily implementable method shows promise for multilingual and\nlow-resource language scenarios.\n","authors":["Tianchi Liu","Ivan Kukanov","Zihan Pan","Qiongqiong Wang","Hardik B. Sailor","Kong Aik Lee"],"pdf_url":"https://arxiv.org/pdf/2409.08346v1.pdf","comment":"Accepted to the IEEE Spoken Language Technology Workshop (SLT) 2024.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.08330v1","updated":"2024-09-12T18:00:18Z","published":"2024-09-12T18:00:18Z","title":"Real or Robotic? Assessing Whether LLMs Accurately Simulate Qualities of\n Human Responses in Dialogue","summary":" Studying and building datasets for dialogue tasks is both expensive and\ntime-consuming due to the need to recruit, train, and collect data from study\nparticipants. In response, much recent work has sought to use large language\nmodels (LLMs) to simulate both human-human and human-LLM interactions, as they\nhave been shown to generate convincingly human-like text in many settings.\nHowever, to what extent do LLM-based simulations \\textit{actually} reflect\nhuman dialogues? In this work, we answer this question by generating a\nlarge-scale dataset of 100,000 paired LLM-LLM and human-LLM dialogues from the\nWildChat dataset and quantifying how well the LLM simulations align with their\nhuman counterparts. Overall, we find relatively low alignment between\nsimulations and human interactions, demonstrating a systematic divergence along\nthe multiple textual properties, including style and content. Further, in\ncomparisons of English, Chinese, and Russian dialogues, we find that models\nperform similarly. 
Our results suggest that LLMs generally perform better when\nthe human themself writes in a way that is more similar to the LLM's own style.\n","authors":["Johnathan Ivey","Shivani Kumar","Jiayu Liu","Hua Shen","Sushrita Rakshit","Rohan Raju","Haotian Zhang","Aparna Ananthasubramaniam","Junghwan Kim","Bowen Yi","Dustin Wright","Abraham Israeli","Anders Giovanni Møller","Lechen Zhang","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2409.08330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15365v3","updated":"2024-09-12T17:49:05Z","published":"2024-03-22T17:33:11Z","title":"A Transfer Attack to Image Watermarks","summary":" Watermark has been widely deployed by industry to detect AI-generated images.\nThe robustness of such watermark-based detector against evasion attacks in the\nwhite-box and black-box settings is well understood in the literature. However,\nthe robustness in the no-box setting is much less understood. In this work, we\npropose a new transfer evasion attack to image watermark in the no-box setting.\nOur transfer attack adds a perturbation to a watermarked image to evade\nmultiple surrogate watermarking models trained by the attacker itself, and the\nperturbed watermarked image also evades the target watermarking model. Our\nmajor contribution is to show that, both theoretically and empirically,\nwatermark-based AI-generated image detector is not robust to evasion attacks\neven if the attacker does not have access to the watermarking model nor the\ndetection API.\n","authors":["Yuepeng Hu","Zhengyuan Jiang","Moyang Guo","Neil Gong"],"pdf_url":"https://arxiv.org/pdf/2403.15365v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08239v1","updated":"2024-09-12T17:39:08Z","published":"2024-09-12T17:39:08Z","title":"Source2Synth: Synthetic Data Generation and Curation Grounded in Real\n Data Sources","summary":" Large Language Models still struggle in challenging scenarios that leverage\nstructured data, complex reasoning, or tool usage. In this paper, we propose\nSource2Synth: a new method that can be used for teaching LLMs new skills\nwithout relying on costly human annotations. Source2Synth takes as input a\ncustom data source and produces synthetic data points with intermediate\nreasoning steps grounded in real-world sources. Source2Synth improves the\ndataset quality by discarding low-quality generations based on their\nanswerability. We demonstrate the generality of this approach by applying it to\ntwo challenging domains: we test reasoning abilities in multi-hop question\nanswering (MHQA), and tool usage in tabular question answering (TQA). Our\nmethod improves performance by 25.51% for TQA on WikiSQL and 22.57% for MHQA on\nHotPotQA compared to the fine-tuned baselines.\n","authors":["Alisia Lupidi","Carlos Gemmell","Nicola Cancedda","Jane Dwivedi-Yu","Jason Weston","Jakob Foerster","Roberta Raileanu","Maria Lomeli"],"pdf_url":"https://arxiv.org/pdf/2409.08239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08234v1","updated":"2024-09-12T17:33:06Z","published":"2024-09-12T17:33:06Z","title":"LLM Honeypot: Leveraging Large Language Models as Advanced Interactive\n Honeypot Systems","summary":" The rapid evolution of cyber threats necessitates innovative solutions for\ndetecting and analyzing malicious activity. Honeypots, which are decoy systems\ndesigned to lure and interact with attackers, have emerged as a critical\ncomponent in cybersecurity. 
In this paper, we present a novel approach to\ncreating realistic and interactive honeypot systems using Large Language Models\n(LLMs). By fine-tuning a pre-trained open-source language model on a diverse\ndataset of attacker-generated commands and responses, we developed a honeypot\ncapable of sophisticated engagement with attackers. Our methodology involved\nseveral key steps: data collection and processing, prompt engineering, model\nselection, and supervised fine-tuning to optimize the model's performance.\nEvaluation through similarity metrics and live deployment demonstrated that our\napproach effectively generates accurate and informative responses. The results\nhighlight the potential of LLMs to revolutionize honeypot technology, providing\ncybersecurity professionals with a powerful tool to detect and analyze\nmalicious activity, thereby enhancing overall security infrastructure.\n","authors":["Hakan T. Otal","M. Abdullah Canbaz"],"pdf_url":"https://arxiv.org/pdf/2409.08234v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08202v1","updated":"2024-09-12T16:41:47Z","published":"2024-09-12T16:41:47Z","title":"What Makes a Maze Look Like a Maze?","summary":" A unique aspect of human visual understanding is the ability to flexibly\ninterpret abstract concepts: acquiring lifted rules explaining what they\nsymbolize, grounding them across familiar and unfamiliar contexts, and making\npredictions or reasoning about them. While off-the-shelf vision-language models\nexcel at making literal interpretations of images (e.g., recognizing object\ncategories such as tree branches), they still struggle to make sense of such\nvisual abstractions (e.g., how an arrangement of tree branches may form the\nwalls of a maze). To address this challenge, we introduce Deep Schema Grounding\n(DSG), a framework that leverages explicit structured representations of visual\nabstractions for grounding and reasoning. At the core of DSG are\nschemas--dependency graph descriptions of abstract concepts that decompose them\ninto more primitive-level symbols. DSG uses large language models to extract\nschemas, then hierarchically grounds concrete to abstract components of the\nschema onto images with vision-language models. The grounded schema is used to\naugment visual abstraction understanding. We systematically evaluate DSG and\ndifferent methods in reasoning on our new Visual Abstractions Dataset, which\nconsists of diverse, real-world images of abstract concepts and corresponding\nquestion-answer pairs labeled by humans. We show that DSG significantly\nimproves the abstract visual reasoning performance of vision-language models,\nand is a step toward human-aligned understanding of visual abstractions.\n","authors":["Joy Hsu","Jiayuan Mao","Joshua B. Tenenbaum","Noah D. Goodman","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2409.08202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08199v1","updated":"2024-09-12T16:36:39Z","published":"2024-09-12T16:36:39Z","title":"AudioBERT: Audio Knowledge Augmented Language Model","summary":" Recent studies have identified that language models, pretrained on text-only\ndatasets, often lack elementary visual knowledge, \\textit{e.g.,} colors of\neveryday objects. Motivated by this observation, we ask whether a similar\nshortcoming exists in terms of the \\textit{auditory} knowledge. To answer this\nquestion, we construct a new dataset called AuditoryBench, which consists of\ntwo novel tasks for evaluating auditory knowledge. 
Based on our analysis using\nthe benchmark, we find that language models also suffer from a severe lack of\nauditory knowledge. To address this limitation, we propose AudioBERT, a novel\nmethod to augment the auditory knowledge of BERT through a retrieval-based\napproach. First, we detect auditory knowledge spans in prompts to query our\nretrieval model efficiently. Then, we inject audio knowledge into BERT and\nswitch on low-rank adaptation for effective adaptation when audio knowledge is\nrequired. Our experiments demonstrate that AudioBERT is quite effective,\nachieving superior performance on the AuditoryBench. The dataset and code are\navailable at \\bulurl{https://github.com/HJ-Ok/AudioBERT}.\n","authors":["Hyunjong Ok","Suho Yoo","Jaeho Lee"],"pdf_url":"https://arxiv.org/pdf/2409.08199v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.15590v2","updated":"2024-09-12T16:34:17Z","published":"2024-05-24T14:20:45Z","title":"Profiling checkpointing schedules in adjoint ST-AD","summary":" Checkpointing is a cornerstone of data-flow reversal in adjoint algorithmic\ndifferentiation. Checkpointing is a storage/recomputation trade-off that can be\napplied at different levels, one of which being the call tree. We are looking\nfor good placements of checkpoints onto the call tree of a given application,\nto reduce run time and memory footprint of its adjoint. There is no known\noptimal solution to this problem other than a combinatorial search on all\nplacements. We propose a heuristics based on run-time profiling of the adjoint\ncode. We describe implementation of this profiling tool in an existing\nsource-transformation AD tool. We demonstrate the interest of this approach on\ntest cases taken from the MITgcm ocean and atmospheric global circulation\nmodel. We discuss the limitations of our approach and propose directions to\nlift them.\n","authors":["Laurent Hascoët","Jean-Luc Bouchot","Shreyas Sunil Gaikwad","Sri Hari Krishna Narayanan","Jan Hückelheim"],"pdf_url":"https://arxiv.org/pdf/2405.15590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08185v1","updated":"2024-09-12T16:20:57Z","published":"2024-09-12T16:20:57Z","title":"Fine-tuning Large Language Models for Entity Matching","summary":" Generative large language models (LLMs) are a promising alternative to\npre-trained language models for entity matching due to their high zero-shot\nperformance and their ability to generalize to unseen entities. Existing\nresearch on using LLMs for entity matching has focused on prompt engineering\nand in-context learning. This paper explores the potential of fine-tuning LLMs\nfor entity matching. We analyze fine-tuning along two dimensions: 1) The\nrepresentation of training examples, where we experiment with adding different\ntypes of LLM-generated explanations to the training set, and 2) the selection\nand generation of training examples using LLMs. In addition to the matching\nperformance on the source dataset, we investigate how fine-tuning affects the\nmodel's ability to generalize to other in-domain datasets as well as across\ntopical domains. Our experiments show that fine-tuning significantly improves\nthe performance of the smaller models while the results for the larger models\nare mixed. Fine-tuning also improves the generalization to in-domain datasets\nwhile hurting cross-domain transfer. 
We show that adding structured\nexplanations to the training set has a positive impact on the performance of\nthree out of four LLMs, while the proposed example selection and generation\nmethods only improve the performance of Llama 3.1 8B while decreasing the\nperformance of GPT-4o Mini.\n","authors":["Aaron Steiner","Ralph Peeters","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2409.08185v1.pdf","comment":"8 pages, 4 figures. For related code and data, see this\n https://github.com/wbsg-uni-mannheim/TailorMatch"},{"id":"http://arxiv.org/abs/2408.14809v2","updated":"2024-09-12T16:11:41Z","published":"2024-08-27T06:44:28Z","title":"GSIFN: A Graph-Structured and Interlaced-Masked Multimodal\n Transformer-based Fusion Network for Multimodal Sentiment Analysis","summary":" Multimodal Sentiment Analysis (MSA) leverages multiple data modals to analyze\nhuman sentiment. Existing MSA models generally employ cutting-edge multimodal\nfusion and representation learning-based methods to promote MSA capability.\nHowever, there are two key challenges: (i) in existing multimodal fusion\nmethods, the decoupling of modal combinations and tremendous parameter\nredundancy, lead to insufficient fusion performance and efficiency; (ii) a\nchallenging trade-off exists between representation capability and\ncomputational overhead in unimodal feature extractors and encoders. Our\nproposed GSIFN incorporates two main components to solve these problems: (i) a\ngraph-structured and interlaced-masked multimodal Transformer. It adopts the\nInterlaced Mask mechanism to construct robust multimodal graph embedding,\nachieve all-modal-in-one Transformer-based fusion, and greatly reduce the\ncomputational overhead; (ii) a self-supervised learning framework with low\ncomputational overhead and high performance, which utilizes a parallelized LSTM\nwith matrix memory to enhance non-verbal modal features for unimodal label\ngeneration. Evaluated on the MSA datasets CMU-MOSI, CMU-MOSEI, and CH-SIMS,\nGSIFN demonstrates superior performance with significantly lower computational\noverhead compared with previous state-of-the-art models.\n","authors":["Yijie Jin"],"pdf_url":"https://arxiv.org/pdf/2408.14809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13378v2","updated":"2024-09-12T16:06:37Z","published":"2024-08-23T21:24:59Z","title":"DrugAgent: Explainable Drug Repurposing Agent with Large Language\n Model-based Reasoning","summary":" Drug repurposing offers a promising avenue for accelerating drug development\nby identifying new therapeutic potentials of existing drugs. In this paper, we\npropose a multi-agent framework to enhance the drug repurposing process using\nstate-of-the-art machine learning techniques and knowledge integration. Our\nframework comprises several specialized agents: an AI Agent trains robust\ndrug-target interaction (DTI) models; a Knowledge Graph Agent utilizes the\ndrug-gene interaction database (DGIdb), DrugBank, Comparative Toxicogenomics\nDatabase (CTD), and Search Tool for Interactions of Chemicals (STITCH) to\nsystematically extract DTIs; and a Search Agent interacts with biomedical\nliterature to annotate and verify computational predictions. By integrating\noutputs from these agents, our system effectively harnesses diverse data\nsources, including external databases, to propose viable repurposing\ncandidates. 
Preliminary results demonstrate the potential of our approach in\nnot only predicting drug-disease interactions but also in reducing the time and\ncost associated with traditional drug discovery methods. This paper highlights\nthe scalability of multi-agent systems in biomedical research and their role in\ndriving innovation in drug repurposing. Our approach not only outperforms\nexisting methods in predicting drug repurposing potential but also provides\ninterpretable results, paving the way for more efficient and cost-effective\ndrug discovery processes.\n","authors":["Yoshitaka Inoue","Tianci Song","Tianfan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.13378v2.pdf","comment":"18 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.08160v1","updated":"2024-09-12T15:52:22Z","published":"2024-09-12T15:52:22Z","title":"On the Role of Context in Reading Time Prediction","summary":" We present a new perspective on how readers integrate context during\nreal-time language comprehension. Our proposals build on surprisal theory,\nwhich posits that the processing effort of a linguistic unit (e.g., a word) is\nan affine function of its in-context information content. We first observe that\nsurprisal is only one out of many potential ways that a contextual predictor\ncan be derived from a language model. Another one is the pointwise mutual\ninformation (PMI) between a unit and its context, which turns out to yield the\nsame predictive power as surprisal when controlling for unigram frequency.\nMoreover, both PMI and surprisal are correlated with frequency. This means that\nneither PMI nor surprisal contains information about context alone. In response\nto this, we propose a technique where we project surprisal onto the orthogonal\ncomplement of frequency, yielding a new contextual predictor that is\nuncorrelated with frequency. Our experiments show that the proportion of\nvariance in reading times explained by context is a lot smaller when context is\nrepresented by the orthogonalized predictor. From an interpretability\nstandpoint, this indicates that previous studies may have overstated the role\nthat context has in predicting reading times.\n","authors":["Andreas Opedal","Eleanor Chodroff","Ryan Cotterell","Ethan Gotlieb Wilcox"],"pdf_url":"https://arxiv.org/pdf/2409.08160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08147v1","updated":"2024-09-12T15:40:45Z","published":"2024-09-12T15:40:45Z","title":"LLM-POTUS Score: A Framework of Analyzing Presidential Debates with\n Large Language Models","summary":" Large language models have demonstrated remarkable capabilities in natural\nlanguage processing, yet their application to political discourse analysis\nremains underexplored. This paper introduces a novel approach to evaluating\npresidential debate performances using LLMs, addressing the longstanding\nchallenge of objectively assessing debate outcomes. We propose a framework that\nanalyzes candidates' \"Policies, Persona, and Perspective\" (3P) and how they\nresonate with the \"Interests, Ideologies, and Identity\" (3I) of four key\naudience groups: voters, businesses, donors, and politicians. Our method\nemploys large language models to generate the LLM-POTUS Score, a quantitative\nmeasure of debate performance based on the alignment between 3P and 3I. We\napply this framework to analyze transcripts from recent U.S. presidential\ndebates, demonstrating its ability to provide nuanced, multi-dimensional\nassessments of candidate performances. 
Our results reveal insights into the\neffectiveness of different debating strategies and their impact on various\naudience segments. This study not only offers a new tool for political analysis\nbut also explores the potential and limitations of using LLMs as impartial\njudges in complex social contexts. In addition, this framework provides\nindividual citizens with an independent tool to evaluate presidential debate\nperformances, which enhances democratic engagement and reduces reliance on\npotentially biased media interpretations and institutional influence, thereby\nstrengthening the foundation of informed civic participation.\n","authors":["Zhengliang Liu","Yiwei Li","Oleksandra Zolotarevych","Rongwei Yang","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2409.08147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06857v2","updated":"2024-09-12T15:04:57Z","published":"2024-09-10T20:45:43Z","title":"What is the Role of Small Models in the LLM Era: A Survey","summary":" Large Language Models (LLMs) have made significant progress in advancing\nartificial general intelligence (AGI), leading to the development of\nincreasingly large models such as GPT-4 and LLaMA-405B. However, scaling up\nmodel sizes results in exponentially higher computational costs and energy\nconsumption, making these models impractical for academic researchers and\nbusinesses with limited resources. At the same time, Small Models (SMs) are\nfrequently used in practical settings, although their significance is currently\nunderestimated. This raises important questions about the role of small models\nin the era of LLMs, a topic that has received limited attention in prior\nresearch. In this work, we systematically examine the relationship between LLMs\nand SMs from two key perspectives: Collaboration and Competition. We hope this\nsurvey provides valuable insights for practitioners, fostering a deeper\nunderstanding of the contribution of small models and promoting more efficient\nuse of computational resources. The code is available at\nhttps://github.com/tigerchen52/role_of_small_models\n","authors":["Lihu Chen","Gaël Varoquaux"],"pdf_url":"https://arxiv.org/pdf/2409.06857v2.pdf","comment":"a survey paper of small models"},{"id":"http://arxiv.org/abs/2409.08107v1","updated":"2024-09-12T15:00:56Z","published":"2024-09-12T15:00:56Z","title":"WhisperNER: Unified Open Named Entity and Speech Recognition","summary":" Integrating named entity recognition (NER) with automatic speech recognition\n(ASR) can significantly enhance transcription accuracy and informativeness. In\nthis paper, we introduce WhisperNER, a novel model that allows joint speech\ntranscription and entity recognition. WhisperNER supports open-type NER,\nenabling recognition of diverse and evolving entities at inference. Building on\nrecent advancements in open NER research, we augment a large synthetic dataset\nwith synthetic speech samples. This allows us to train WhisperNER on a large\nnumber of examples with diverse NER tags. During training, the model is\nprompted with NER labels and optimized to output the transcribed utterance\nalong with the corresponding tagged entities. To evaluate WhisperNER, we\ngenerate synthetic speech for commonly used NER benchmarks and annotate\nexisting ASR datasets with open NER tags. 
Our experiments demonstrate that\nWhisperNER outperforms natural baselines on both out-of-domain open type NER\nand supervised finetuning.\n","authors":["Gil Ayache","Menachem Pirchi","Aviv Navon","Aviv Shamsian","Gill Hetz","Joseph Keshet"],"pdf_url":"https://arxiv.org/pdf/2409.08107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.03277v6","updated":"2024-09-12T14:56:40Z","published":"2021-02-05T16:35:38Z","title":"Minimum projective linearizations of trees in linear time","summary":" The Minimum Linear Arrangement problem (MLA) consists of finding a mapping\n$\\pi$ from vertices of a graph to distinct integers that minimizes\n$\\sum_{\\{u,v\\}\\in E}|\\pi(u) - \\pi(v)|$. In that setting, vertices are often\nassumed to lie on a horizontal line and edges are drawn as semicircles above\nsaid line. For trees, various algorithms are available to solve the problem in\npolynomial time in $n=|V|$. There exist variants of the MLA in which the\narrangements are constrained. Iordanskii, and later Hochberg and Stallmann\n(HS), put forward $O(n)$-time algorithms that solve the problem when\narrangements are constrained to be planar (also known as one-page book\nembeddings). We also consider linear arrangements of rooted trees that are\nconstrained to be projective (planar embeddings where the root is not covered\nby any edge). Gildea and Temperley (GT) sketched an algorithm for projective\narrangements which they claimed runs in $O(n)$ but did not provide any\njustification of its cost. In contrast, Park and Levy claimed that GT's\nalgorithm runs in $O(n \\log d_{max})$ where $d_{max}$ is the maximum degree but\ndid not provide sufficient detail. Here we correct an error in HS's algorithm\nfor the planar case, show its relationship with the projective case, and derive\nsimple algorithms for the projective and planar cases that run without a doubt\nin $O(n)$ time.\n","authors":["Lluís Alemany-Puig","Juan Luis Esteban","Ramon Ferrer-i-Cancho"],"pdf_url":"https://arxiv.org/pdf/2102.03277v6.pdf","comment":"Here we have corrected a mistake we made in the previous version. In\n particular, line 7 of Algorithm 3.2 used to say: \"For i = 1 to |C_v| ...\"; it\n should be \"For i = 2 to |C_v| ...\" (notice the change from 'i=1' to 'i=2')"},{"id":"http://arxiv.org/abs/2409.02387v3","updated":"2024-09-12T14:56:35Z","published":"2024-09-04T02:30:12Z","title":"Large Language Models and Cognitive Science: A Comprehensive Review of\n Similarities, Differences, and Challenges","summary":" This comprehensive review explores the intersection of Large Language Models\n(LLMs) and cognitive science, examining similarities and differences between\nLLMs and human cognitive processes. We analyze methods for evaluating LLMs\ncognitive abilities and discuss their potential as cognitive models. The review\ncovers applications of LLMs in various cognitive fields, highlighting insights\ngained for cognitive science research. We assess cognitive biases and\nlimitations of LLMs, along with proposed methods for improving their\nperformance. The integration of LLMs with cognitive architectures is examined,\nrevealing promising avenues for enhancing artificial intelligence (AI)\ncapabilities. Key challenges and future research directions are identified,\nemphasizing the need for continued refinement of LLMs to better align with\nhuman cognition. 
This review provides a balanced perspective on the current\nstate and future potential of LLMs in advancing our understanding of both\nartificial and human intelligence.\n","authors":["Qian Niu","Junyu Liu","Ziqian Bi","Pohsun Feng","Benji Peng","Keyu Chen","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2409.02387v3.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.08103v1","updated":"2024-09-12T14:55:33Z","published":"2024-09-12T14:55:33Z","title":"The Faetar Benchmark: Speech Recognition in a Very Under-Resourced\n Language","summary":" We introduce the Faetar Automatic Speech Recognition Benchmark, a benchmark\ncorpus designed to push the limits of current approaches to low-resource speech\nrecognition. Faetar, a Franco-Proven\\c{c}al variety spoken primarily in Italy,\nhas no standard orthography, has virtually no existing textual or speech\nresources other than what is included in the benchmark, and is quite different\nfrom other forms of Franco-Proven\\c{c}al. The corpus comes from field\nrecordings, most of which are noisy, for which only 5 hrs have matching\ntranscriptions, and for which forced alignment is of variable quality. The\ncorpus contains an additional 20 hrs of unlabelled speech. We report baseline\nresults from state-of-the-art multilingual speech foundation models with a best\nphone error rate of 30.4%, using a pipeline that continues pre-training on the\nfoundation model using the unlabelled set.\n","authors":["Michael Ong","Sean Robertson","Leo Peckham","Alba Jorquera Jimenez de Aberasturi","Paula Arkhangorodsky","Robin Huo","Aman Sakhardande","Mark Hallap","Naomi Nagy","Ewan Dunbar"],"pdf_url":"https://arxiv.org/pdf/2409.08103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08098v1","updated":"2024-09-12T14:51:43Z","published":"2024-09-12T14:51:43Z","title":"The CLC-UKET Dataset: Benchmarking Case Outcome Prediction for the UK\n Employment Tribunal","summary":" This paper explores the intersection of technological innovation and access\nto justice by developing a benchmark for predicting case outcomes in the UK\nEmployment Tribunal (UKET). To address the challenge of extensive manual\nannotation, the study employs a large language model (LLM) for automatic\nannotation, resulting in the creation of the CLC-UKET dataset. The dataset\nconsists of approximately 19,000 UKET cases and their metadata. Comprehensive\nlegal annotations cover facts, claims, precedent references, statutory\nreferences, case outcomes, reasons and jurisdiction codes. Facilitated by the\nCLC-UKET data, we examine a multi-class case outcome prediction task in the\nUKET. Human predictions are collected to establish a performance reference for\nmodel comparison. Empirical results from baseline models indicate that\nfinetuned transformer models outperform zero-shot and few-shot LLMs on the UKET\nprediction task. The performance of zero-shot LLMs can be enhanced by\nintegrating task-related information into few-shot examples. 
We hope that the\nCLC-UKET dataset, along with human annotations and empirical findings, can\nserve as a valuable benchmark for employment-related dispute resolution.\n","authors":["Huiyuan Xie","Felix Steffek","Joana Ribeiro de Faria","Christine Carter","Jonathan Rutherford"],"pdf_url":"https://arxiv.org/pdf/2409.08098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16570v3","updated":"2024-09-12T14:40:09Z","published":"2024-08-29T14:37:05Z","title":"Predictability maximization and the origins of word order harmony","summary":" We address the linguistic problem of the sequential arrangement of a head and\nits dependents from an information theoretic perspective. In particular, we\nconsider the optimal placement of a head that maximizes the predictability of\nthe sequence. We assume that dependents are statistically independent given a\nhead, in line with the open-choice principle and the core assumptions of\ndependency grammar. We demonstrate the optimality of harmonic order, i.e.,\nplacing the head last maximizes the predictability of the head whereas placing\nthe head first maximizes the predictability of dependents. We also show that\npostponing the head is the optimal strategy to maximize its predictability\nwhile bringing it forward is the optimal strategy to maximize the\npredictability of dependents. We unravel the advantages of the strategy of\nmaximizing the predictability of the head over maximizing the predictability of\ndependents. Our findings shed light on the placements of the head adopted by\nreal languages or emerging in different kinds of experiments.\n","authors":["Ramon Ferrer-i-Cancho"],"pdf_url":"https://arxiv.org/pdf/2408.16570v3.pdf","comment":"Typos corrected; new references added"},{"id":"http://arxiv.org/abs/2409.08069v1","updated":"2024-09-12T14:24:45Z","published":"2024-09-12T14:24:45Z","title":"TravelAgent: An AI Assistant for Personalized Travel Planning","summary":" As global tourism expands and artificial intelligence technology advances,\nintelligent travel planning services have emerged as a significant research\nfocus. Within dynamic real-world travel scenarios with multi-dimensional\nconstraints, services that support users in automatically creating practical\nand customized travel itineraries must address three key objectives:\nRationality, Comprehensiveness, and Personalization. However, existing systems\nwith rule-based combinations or LLM-based planning methods struggle to fully\nsatisfy these criteria. To overcome the challenges, we introduce TravelAgent, a\ntravel planning system powered by large language models (LLMs) designed to\nprovide reasonable, comprehensive, and personalized travel itineraries grounded\nin dynamic scenarios. TravelAgent comprises four modules: Tool-usage,\nRecommendation, Planning, and Memory Module. 
We evaluate TravelAgent's\nperformance with human and simulated users, demonstrating its overall\neffectiveness in three criteria and confirming the accuracy of personalized\nrecommendations.\n","authors":["Aili Chen","Xuyang Ge","Ziquan Fu","Yanghua Xiao","Jiangjie Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03302v4","updated":"2024-09-12T11:51:51Z","published":"2024-04-04T08:52:30Z","title":"How Easily do Irrelevant Inputs Skew the Responses of Large Language\n Models?","summary":" By leveraging the retrieval of information from external knowledge databases,\nLarge Language Models (LLMs) exhibit enhanced capabilities for accomplishing\nmany knowledge-intensive tasks. However, due to the inherent flaws of current\nretrieval systems, there might exist irrelevant information within those\nretrieving top-ranked passages. In this work, we present a comprehensive\ninvestigation into the robustness of LLMs to different types of irrelevant\ninformation under various conditions. We initially introduce a framework to\nconstruct high-quality irrelevant information that ranges from semantically\nunrelated, partially related, and related to questions. Furthermore, our\nanalysis demonstrates that the constructed irrelevant information not only\nscores highly on similarity metrics, being highly retrieved by existing\nsystems, but also bears semantic connections to the context. Our investigation\nreveals that current LLMs still face challenges in discriminating highly\nsemantically related information and can be easily distracted by these\nirrelevant yet misleading content. Besides, we also find that current solutions\nfor handling irrelevant information have limitations in improving the\nrobustness of LLMs to such distractions. All the resources are available on\nGitHub at https://github.com/Di-viner/LLM-Robustness-to-Irrelevant-Information.\n","authors":["Siye Wu","Jian Xie","Jiangjie Chen","Tinghui Zhu","Kai Zhang","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.03302v4.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2409.07958v1","updated":"2024-09-12T11:37:34Z","published":"2024-09-12T11:37:34Z","title":"Enhanced Online Grooming Detection Employing Context Determination and\n Message-Level Analysis","summary":" Online Grooming (OG) is a prevalent threat facing predominately children\nonline, with groomers using deceptive methods to prey on the vulnerability of\nchildren on social media/messaging platforms. These attacks can have severe\npsychological and physical impacts, including a tendency towards\nrevictimization. Current technical measures are inadequate, especially with the\nadvent of end-to-end encryption which hampers message monitoring. Existing\nsolutions focus on the signature analysis of child abuse media, which does not\neffectively address real-time OG detection. This paper proposes that OG attacks\nare complex, requiring the identification of specific communication patterns\nbetween adults and children. It introduces a novel approach leveraging advanced\nmodels such as BERT and RoBERTa for Message-Level Analysis and a Context\nDetermination approach for classifying actor interactions, including the\nintroduction of Actor Significance Thresholds and Message Significance\nThresholds. The proposed method aims to enhance accuracy and robustness in\ndetecting OG by considering the dynamic and multi-faceted nature of these\nattacks. 
Cross-dataset experiments evaluate the robustness and versatility of\nour approach. This paper's contributions include improved detection\nmethodologies and the potential for application in various scenarios,\naddressing gaps in current literature and practices.\n","authors":["Jake Street","Isibor Ihianle","Funminiyi Olajide","Ahmad Lotfi"],"pdf_url":"https://arxiv.org/pdf/2409.07958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12091v4","updated":"2024-09-12T11:26:39Z","published":"2022-01-28T13:00:17Z","title":"Linear Adversarial Concept Erasure","summary":" Modern neural models trained on textual data rely on pre-trained\nrepresentations that emerge without direct supervision. As these\nrepresentations are increasingly being used in real-world applications, the\ninability to \\emph{control} their content becomes an increasingly important\nproblem. We formulate the problem of identifying and erasing a linear subspace\nthat corresponds to a given concept, in order to prevent linear predictors from\nrecovering the concept. We model this problem as a constrained, linear maximin\ngame, and show that existing solutions are generally not optimal for this task.\nWe derive a closed-form solution for certain objectives, and propose a convex\nrelaxation, \\method, that works well for others. When evaluated in the context\nof binary gender removal, the method recovers a low-dimensional subspace whose\nremoval mitigates bias by intrinsic and extrinsic evaluation. We show that the\nmethod is highly expressive, effectively mitigating bias in deep nonlinear\nclassifiers while maintaining tractability and interpretability.\n","authors":["Shauli Ravfogel","Michael Twiton","Yoav Goldberg","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2201.12091v4.pdf","comment":"Accepted in ICML 2022; a revised version"},{"id":"http://arxiv.org/abs/2407.14133v2","updated":"2024-09-12T11:17:46Z","published":"2024-07-19T09:03:30Z","title":"I Know About \"Up\"! Enhancing Spatial Reasoning in Visual Language Models\n Through 3D Reconstruction","summary":" Visual Language Models (VLMs) are essential for various tasks, particularly\nvisual reasoning tasks, due to their robust multi-modal information\nintegration, visual reasoning capabilities, and contextual awareness. However,\nexisting \\VLMs{}' visual spatial reasoning capabilities are often inadequate,\nstruggling even with basic tasks such as distinguishing left from right. To\naddress this, we propose the \\ours{} model, designed to enhance the visual\nspatial reasoning abilities of VLMS. 
ZeroVLM employs Zero-1-to-3, a 3D\nreconstruction model for obtaining different views of the input images and\nincorporates a prompting mechanism to further improve visual spatial reasoning.\nExperimental results on four visual spatial reasoning datasets show that our\n\\ours{} achieves up to 19.48% accuracy improvement, which indicates the\neffectiveness of the 3D reconstruction and prompting mechanisms of our ZeroVLM.\n","authors":["Zaiqiao Meng","Hao Zhou","Yifang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.14133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07891v1","updated":"2024-09-12T09:51:56Z","published":"2024-09-12T09:51:56Z","title":"A corpus-based investigation of pitch contours of monosyllabic words in\n conversational Taiwan Mandarin","summary":" In Mandarin, the tonal contours of monosyllabic words produced in isolation\nor in careful speech are characterized by four lexical tones: a high-level tone\n(T1), a rising tone (T2), a dipping tone (T3) and a falling tone (T4). However,\nin spontaneous speech, the actual tonal realization of monosyllabic words can\ndeviate significantly from these canonical tones due to intra-syllabic\nco-articulation and inter-syllabic co-articulation with adjacent tones. In\naddition, Chuang et al. (2024) recently reported that the tonal contours of\ndisyllabic Mandarin words with T2-T4 tone pattern are co-determined by their\nmeanings. Following up on their research, we present a corpus-based\ninvestigation of how the pitch contours of monosyllabic words are realized in\nspontaneous conversational Mandarin, focusing on the effects of contextual\npredictors on the one hand, and the way in words' meanings co-determine pitch\ncontours on the other hand. We analyze the F0 contours of 3824 tokens of 63\ndifferent word types in a spontaneous Taiwan Mandarin corpus, using the\ngeneralized additive (mixed) model to decompose a given observed pitch contour\ninto a set of component pitch contours. We show that the tonal context\nsubstantially modify a word's canonical tone. Once the effect of tonal context\nis controlled for, T2 and T3 emerge as low flat tones, contrasting with T1 as a\nhigh tone, and with T4 as a high-to-mid falling tone. The neutral tone (T0),\nwhich in standard descriptions, is realized based on the preceding tone,\nemerges as a low tone in its own right, modified by the other predictors in the\nsame way as the standard tones T1, T2, T3, and T4. We also show that word, and\neven more so, word sense, co-determine words' F0 contours. Analyses of variable\nimportance using random forests further supported the substantial effect of\ntonal context and an effect of word sense.\n","authors":["Xiaoyun Jin","Mirjam Ernestus","R. Harald Baayen"],"pdf_url":"https://arxiv.org/pdf/2409.07891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07869v1","updated":"2024-09-12T09:27:36Z","published":"2024-09-12T09:27:36Z","title":"Learning Rules from KGs Guided by Language Models","summary":" Advances in information extraction have enabled the automatic construction of\nlarge knowledge graphs (e.g., Yago, Wikidata or Google KG), which are widely\nused in many applications like semantic search or data analytics. However, due\nto their semi-automatic construction, KGs are often incomplete. Rule learning\nmethods, concerned with the extraction of frequent patterns from KGs and\ncasting them into rules, can be applied to predict potentially missing facts. A\ncrucial step in this process is rule ranking. 
Ranking of rules is especially\nchallenging over highly incomplete or biased KGs (e.g., KGs predominantly\nstoring facts about famous people), as in this case biased rules might fit the\ndata best and be ranked at the top based on standard statistical metrics like\nrule confidence. To address this issue, prior works proposed to rank rules not\nonly relying on the original KG but also facts predicted by a KG embedding\nmodel. At the same time, with the recent rise of Language Models (LMs), several\nworks have claimed that LMs can be used as alternative means for KG completion.\nIn this work, our goal is to verify to which extent the exploitation of LMs is\nhelpful for improving the quality of rule learning systems.\n","authors":["Zihang Peng","Daria Stepanova","Vinh Thinh Ho","Heike Adel","Alessandra Russo","Simon Ott"],"pdf_url":"https://arxiv.org/pdf/2409.07869v1.pdf","comment":"proof of concept"},{"id":"http://arxiv.org/abs/2409.07839v1","updated":"2024-09-12T08:38:42Z","published":"2024-09-12T08:38:42Z","title":"FPMT: Enhanced Semi-Supervised Model for Traffic Incident Detection","summary":" For traffic incident detection, the acquisition of data and labels is notably\nresource-intensive, rendering semi-supervised traffic incident detection both a\nformidable and consequential challenge. Thus, this paper focuses on traffic\nincident detection with a semi-supervised learning way. It proposes a\nsemi-supervised learning model named FPMT within the framework of MixText. The\ndata augmentation module introduces Generative Adversarial Networks to balance\nand expand the dataset. During the mix-up process in the hidden space, it\nemploys a probabilistic pseudo-mixing mechanism to enhance regularization and\nelevate model precision. In terms of training strategy, it initiates with\nunsupervised training on all data, followed by supervised fine-tuning on a\nsubset of labeled data, and ultimately completing the goal of semi-supervised\ntraining. Through empirical validation on four authentic datasets, our FPMT\nmodel exhibits outstanding performance across various metrics. Particularly\nnoteworthy is its robust performance even in scenarios with low label rates.\n","authors":["Xinying Lu","Jianli Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.07839v1.pdf","comment":"14 pages, 3 figures, accepted by ICPR 2024"},{"id":"http://arxiv.org/abs/2407.13696v2","updated":"2024-09-12T08:36:47Z","published":"2024-07-18T17:00:23Z","title":"Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with\n BenchBench","summary":" Recent advancements in Language Models (LMs) have catalyzed the creation of\nmultiple benchmarks, designed to assess these models' general capabilities. A\ncrucial task, however, is assessing the validity of the benchmarks themselves.\nThis is most commonly done via Benchmark Agreement Testing (BAT), where new\nbenchmarks are validated against established ones using some agreement metric\n(e.g., rank correlation). Despite the crucial role of BAT for benchmark\nbuilders and consumers, there are no standardized procedures for such agreement\ntesting. This deficiency can lead to invalid conclusions, fostering mistrust in\nbenchmarks and upending the ability to properly choose the appropriate\nbenchmark to use. By analyzing over 40 prominent benchmarks, we demonstrate how\nsome overlooked methodological choices can significantly influence BAT results,\npotentially undermining the validity of conclusions. 
To address these\ninconsistencies, we propose a set of best practices for BAT and demonstrate how\nutilizing these methodologies greatly improves BAT robustness and validity. To\nfoster adoption and facilitate future research,, we introduce BenchBench, a\npython package for BAT, and release the BenchBench-leaderboard, a\nmeta-benchmark designed to evaluate benchmarks using their peers. Our findings\nunderscore the necessity for standardized BAT, ensuring the robustness and\nvalidity of benchmark evaluations in the evolving landscape of language model\nresearch.\n BenchBench Package: github.com/IBM/BenchBench\n Leaderboard: hf.co/spaces/IBM/BenchBench\n","authors":["Yotam Perlitz","Ariel Gera","Ofir Arviv","Asaf Yehudai","Elron Bandel","Eyal Shnarch","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2407.13696v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.07823v1","updated":"2024-09-12T08:11:08Z","published":"2024-09-12T08:11:08Z","title":"Online vs Offline: A Comparative Study of First-Party and Third-Party\n Evaluations of Social Chatbots","summary":" This paper explores the efficacy of online versus offline evaluation methods\nin assessing conversational chatbots, specifically comparing first-party direct\ninteractions with third-party observational assessments. By extending a\nbenchmarking dataset of user dialogs with empathetic chatbots with offline\nthird-party evaluations, we present a systematic comparison between the\nfeedback from online interactions and the more detached offline third-party\nevaluations. Our results reveal that offline human evaluations fail to capture\nthe subtleties of human-chatbot interactions as effectively as online\nassessments. In comparison, automated third-party evaluations using a GPT-4\nmodel offer a better approximation of first-party human judgments given\ndetailed instructions. This study highlights the limitations of third-party\nevaluations in grasping the complexities of user experiences and advocates for\nthe integration of direct interaction feedback in conversational AI evaluation\nto enhance system development and user satisfaction.\n","authors":["Ekaterina Svikhnushina","Pearl Pu"],"pdf_url":"https://arxiv.org/pdf/2409.07823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.14603v3","updated":"2024-09-12T08:08:08Z","published":"2022-03-28T09:28:13Z","title":"The SAME score: Improved cosine based bias score for word embeddings","summary":" With the enourmous popularity of large language models, many researchers have\nraised ethical concerns regarding social biases incorporated in such models.\nSeveral methods to measure social bias have been introduced, but apparently\nthese methods do not necessarily agree regarding the presence or severity of\nbias. Furthermore, some works have shown theoretical issues or severe\nlimitations with certain bias measures. For that reason, we introduce SAME, a\nnovel bias score for semantic bias in embeddings. We conduct a thorough\ntheoretical analysis as well as experiments to show its benefits compared to\nsimilar bias scores from the literature. We further highlight a substantial\nrelation of semantic bias measured by SAME with downstream bias, a connection\nthat has recently been argued to be negligible. 
Instead, we show that SAME is\ncapable of measuring semantic bias and identify potential causes for social\nbias in downstream tasks.\n","authors":["Sarah Schröder","Alexander Schulz","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2203.14603v3.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.15499v2","updated":"2024-09-12T08:07:27Z","published":"2024-01-27T20:31:10Z","title":"Semantic Properties of cosine based bias scores for word embeddings","summary":" Plenty of works have brought social biases in language models to attention\nand proposed methods to detect such biases. As a result, the literature\ncontains a great deal of different bias tests and scores, each introduced with\nthe premise to uncover yet more biases that other scores fail to detect. What\nseverely lacks in the literature, however, are comparative studies that analyse\nsuch bias scores and help researchers to understand the benefits or limitations\nof the existing methods. In this work, we aim to close this gap for cosine\nbased bias scores. By building on a geometric definition of bias, we propose\nrequirements for bias scores to be considered meaningful for quantifying\nbiases. Furthermore, we formally analyze cosine based scores from the\nliterature with regard to these requirements. We underline these findings with\nexperiments to show that the bias scores' limitations have an impact in the\napplication case.\n","authors":["Sarah Schröder","Alexander Schulz","Fabian Hinder","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2401.15499v2.pdf","comment":"11 pages, 3 figures. arXiv admin note: text overlap with\n arXiv:2111.07864"},{"id":"http://arxiv.org/abs/2111.07864v2","updated":"2024-09-12T08:04:04Z","published":"2021-11-15T16:07:15Z","title":"Evaluating Metrics for Bias in Word Embeddings","summary":" Over the last years, word and sentence embeddings have established as text\npreprocessing for all kinds of NLP tasks and improved the performances\nsignificantly. Unfortunately, it has also been shown that these embeddings\ninherit various kinds of biases from the training data and thereby pass on\nbiases present in society to NLP solutions. Many papers attempted to quantify\nbias in word or sentence embeddings to evaluate debiasing methods or compare\ndifferent embedding models, usually with cosine-based metrics. However, lately\nsome works have raised doubts about these metrics showing that even though such\nmetrics report low biases, other tests still show biases. In fact, there is a\ngreat variety of bias metrics or tests proposed in the literature without any\nconsensus on the optimal solutions. Yet we lack works that evaluate bias\nmetrics on a theoretical level or elaborate the advantages and disadvantages of\ndifferent bias metrics. In this work, we will explore different cosine based\nbias metrics. We formalize a bias definition based on the ideas from previous\nworks and derive conditions for bias metrics. Furthermore, we thoroughly\ninvestigate the existing cosine-based metrics and their limitations to show why\nthese metrics can fail to report biases in some cases. 
Finally, we propose a\nnew metric, SAME, to address the shortcomings of existing metrics and\nmathematically prove that SAME behaves appropriately.\n","authors":["Sarah Schröder","Alexander Schulz","Philip Kenneweg","Robert Feldhans","Fabian Hinder","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2111.07864v2.pdf","comment":"32 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.07809v1","updated":"2024-09-12T07:38:34Z","published":"2024-09-12T07:38:34Z","title":"Controllable Synthetic Clinical Note Generation with Privacy Guarantees","summary":" In the field of machine learning, domain-specific annotated data is an\ninvaluable resource for training effective models. However, in the medical\ndomain, this data often includes Personal Health Information (PHI), raising\nsignificant privacy concerns. The stringent regulations surrounding PHI limit\nthe availability and sharing of medical datasets, which poses a substantial\nchallenge for researchers and practitioners aiming to develop advanced machine\nlearning models. In this paper, we introduce a novel method to \"clone\" datasets\ncontaining PHI. Our approach ensures that the cloned datasets retain the\nessential characteristics and utility of the original data without compromising\npatient privacy. By leveraging differential-privacy techniques and a novel\nfine-tuning task, our method produces datasets that are free from identifiable\ninformation while preserving the statistical properties necessary for model\ntraining. We conduct utility testing to evaluate the performance of machine\nlearning models trained on the cloned datasets. The results demonstrate that\nour cloned datasets not only uphold privacy standards but also enhance model\nperformance compared to those trained on traditional anonymized datasets. This\nwork offers a viable solution for the ethical and effective utilization of\nsensitive medical data in machine learning, facilitating progress in medical\nresearch and the development of robust predictive models.\n","authors":["Tal Baumel","Andre Manoel","Daniel Jones","Shize Su","Huseyin Inan"," Aaron"," Bornstein","Robert Sim"],"pdf_url":"https://arxiv.org/pdf/2409.07809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07790v1","updated":"2024-09-12T06:50:45Z","published":"2024-09-12T06:50:45Z","title":"Full-text Error Correction for Chinese Speech Recognition with Large\n Language Model","summary":" Large Language Models (LLMs) have demonstrated substantial potential for\nerror correction in Automatic Speech Recognition (ASR). However, most research\nfocuses on utterances from short-duration speech recordings, which are the\npredominant form of speech data for supervised ASR training. This paper\ninvestigates the effectiveness of LLMs for error correction in full-text\ngenerated by ASR systems from longer speech recordings, such as transcripts\nfrom podcasts, news broadcasts, and meetings. First, we develop a Chinese\ndataset for full-text error correction, named ChFT, utilizing a pipeline that\ninvolves text-to-speech synthesis, ASR, and error-correction pair extractor.\nThis dataset enables us to correct errors across contexts, including both\nfull-text and segment, and to address a broader range of error types, such as\npunctuation restoration and inverse text normalization, thus making the\ncorrection process comprehensive. Second, we fine-tune a pre-trained LLM on the\nconstructed dataset using a diverse set of prompts and target formats, and\nevaluate its performance on full-text error correction. 
Specifically, we design\nprompts based on full-text and segment, considering various output formats,\nsuch as directly corrected text and JSON-based error-correction pairs. Through\nvarious test settings, including homogeneous, up-to-date, and hard test sets,\nwe find that the fine-tuned LLMs perform well in the full-text setting with\ndifferent prompts, each presenting its own strengths and weaknesses. This\nestablishes a promising baseline for further research. The dataset is available\non the website.\n","authors":["Zhiyuan Tang","Dong Wang","Shen Huang","Shidong Shang"],"pdf_url":"https://arxiv.org/pdf/2409.07790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07787v1","updated":"2024-09-12T06:37:46Z","published":"2024-09-12T06:37:46Z","title":"Stable Language Model Pre-training by Reducing Embedding Variability","summary":" Stable pre-training is essential for achieving better-performing language\nmodels. However, tracking pre-training stability by calculating gradient\nvariance at every step is impractical due to the significant computational\ncosts. We explore Token Embedding Variability (TEV) as a simple and efficient\nproxy for assessing pre-training stability in language models with pre-layer\nnormalization, given that shallower layers are more prone to gradient explosion\n(section 2.2). Moreover, we propose Multi-head Low-Rank Attention (MLRA) as an\narchitecture to alleviate such instability by limiting the exponential growth\nof output embedding variance, thereby preventing the gradient explosion\n(section 3.2). Empirical results on GPT-2 with MLRA demonstrate increased\nstability and lower perplexity, particularly in deeper models.\n","authors":["Woojin Chung","Jiwoo Hong","Na Min An","James Thorne","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2409.07787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07780v1","updated":"2024-09-12T06:27:35Z","published":"2024-09-12T06:27:35Z","title":"Supporting Online Discussions: Integrating AI Into the adhocracy+\n Participation Platform To Enhance Deliberation","summary":" Online spaces allow people to discuss important issues and make joint\ndecisions, regardless of their location or time zone. However, without proper\nsupport and thoughtful design, these discussions often lack structure and\npoliteness during the exchanges of opinions. Artificial intelligence (AI)\nrepresents an opportunity to support both participants and organizers of\nlarge-scale online participation processes. In this paper, we present an\nextension of adhocracy+, a large-scale open source participation platform, that\nprovides two additional debate modules that are supported by AI to enhance the\ndiscussion quality and participant interaction.\n","authors":["Maike Behrendt","Stefan Sylvius Wagner","Stefan Harmeling"],"pdf_url":"https://arxiv.org/pdf/2409.07780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10741v3","updated":"2024-09-12T05:36:06Z","published":"2023-12-17T15:26:16Z","title":"StyleSinger: Style Transfer for Out-of-Domain Singing Voice Synthesis","summary":" Style transfer for out-of-domain (OOD) singing voice synthesis (SVS) focuses\non generating high-quality singing voices with unseen styles (such as timbre,\nemotion, pronunciation, and articulation skills) derived from reference singing\nvoice samples. However, the endeavor to model the intricate nuances of singing\nvoice styles is an arduous task, as singing voices possess a remarkable degree\nof expressiveness. 
Moreover, existing SVS methods encounter a decline in the\nquality of synthesized singing voices in OOD scenarios, as they rest upon the\nassumption that the target vocal attributes are discernible during the training\nphase. To overcome these challenges, we propose StyleSinger, the first singing\nvoice synthesis model for zero-shot style transfer of out-of-domain reference\nsinging voice samples. StyleSinger incorporates two critical approaches for\nenhanced effectiveness: 1) the Residual Style Adaptor (RSA) which employs a\nresidual quantization module to capture diverse style characteristics in\nsinging voices, and 2) the Uncertainty Modeling Layer Normalization (UMLN) to\nperturb the style attributes within the content representation during the\ntraining phase and thus improve the model generalization. Our extensive\nevaluations in zero-shot style transfer undeniably establish that StyleSinger\noutperforms baseline models in both audio quality and similarity to the\nreference singing voice samples. Access to singing voice samples can be found\nat https://stylesinger.github.io/.\n","authors":["Yu Zhang","Rongjie Huang","Ruiqi Li","JinZheng He","Yan Xia","Feiyang Chen","Xinyu Duan","Baoxing Huai","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.10741v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.03426v2","updated":"2024-09-12T04:47:33Z","published":"2024-01-07T09:06:58Z","title":"On Leveraging Large Language Models for Enhancing Entity Resolution: A\n Cost-efficient Approach","summary":" Entity resolution, the task of identifying and merging records that refer to\nthe same real-world entity, is crucial in sectors like e-commerce, healthcare,\nand law enforcement. Large Language Models (LLMs) introduce an innovative\napproach to this task, capitalizing on their advanced linguistic capabilities\nand a ``pay-as-you-go'' model that provides significant advantages to those\nwithout extensive data science expertise. However, current LLMs are costly due\nto per-API request billing. Existing methods often either lack quality or\nbecome prohibitively expensive at scale. To address these problems, we propose\nan uncertainty reduction framework using LLMs to improve entity resolution\nresults. We first initialize possible partitions of the entity cluster, refer\nto the same entity, and define the uncertainty of the result. Then, we reduce\nthe uncertainty by selecting a few valuable matching questions for LLM\nverification. Upon receiving the answers, we update the probability\ndistribution of the possible partitions. To further reduce costs, we design an\nefficient algorithm to judiciously select the most valuable matching pairs to\nquery. Additionally, we create error-tolerant techniques to handle LLM mistakes\nand a dynamic adjustment method to reach truly correct partitions. 
Experimental\nresults show that our method is efficient and effective, offering promising\napplications in real-world tasks.\n","authors":["Huahang Li","Longyu Feng","Shuangyin Li","Fei Hao","Chen Jason Zhang","Yuanfeng Song"],"pdf_url":"https://arxiv.org/pdf/2401.03426v2.pdf","comment":"9 pages, preprint under review"},{"id":"http://arxiv.org/abs/2409.07748v1","updated":"2024-09-12T04:43:27Z","published":"2024-09-12T04:43:27Z","title":"Top-down Activity Representation Learning for Video Question Answering","summary":" Capturing complex hierarchical human activities, from atomic actions (e.g.,\npicking up one present, moving to the sofa, unwrapping the present) to\ncontextual events (e.g., celebrating Christmas) is crucial for achieving\nhigh-performance video question answering (VideoQA). Recent works have expanded\nmultimodal models (e.g., CLIP, LLaVA) to process continuous video sequences,\nenhancing the model's temporal reasoning capabilities. However, these\napproaches often fail to capture contextual events that can be decomposed into\nmultiple atomic actions non-continuously distributed over relatively long-term\nsequences. In this paper, to leverage the spatial visual context representation\ncapability of the CLIP model for obtaining non-continuous visual\nrepresentations in terms of contextual events in videos, we convert long-term\nvideo sequences into a spatial image domain and finetune the multimodal model\nLLaVA for the VideoQA task. Our approach achieves competitive performance on\nthe STAR task, in particular, with a 78.4% accuracy score, exceeding the\ncurrent state-of-the-art score by 2.8 points on the NExTQA task.\n","authors":["Yanan Wang","Shuichiro Haruta","Donghuo Zeng","Julio Vizcarra","Mori Kurokawa"],"pdf_url":"https://arxiv.org/pdf/2409.07748v1.pdf","comment":"presented at MIRU2024"},{"id":"http://arxiv.org/abs/2409.07747v1","updated":"2024-09-12T04:42:51Z","published":"2024-09-12T04:42:51Z","title":"Multi-object event graph representation learning for Video Question\n Answering","summary":" Video question answering (VideoQA) is a task to predict the correct answer to\nquestions posed about a given video. The system must comprehend spatial and\ntemporal relationships among objects extracted from videos to perform causal\nand temporal reasoning. While prior works have focused on modeling individual\nobject movements using transformer-based methods, they falter when capturing\ncomplex scenarios involving multiple objects (e.g., \"a boy is throwing a ball\nin a hoop\"). We propose a contrastive language event graph representation\nlearning method called CLanG to address this limitation. Aiming to capture\nevent representations associated with multiple objects, our method employs a\nmulti-layer GNN-cluster module for adversarial graph representation learning,\nenabling contrastive learning between the question text and its relevant\nmulti-object event graph. Our method outperforms a strong baseline, achieving\nup to 2.2% higher accuracy on two challenging VideoQA datasets, NExT-QA and\nTGIF-QA-R. 
In particular, it is 2.8% better than baselines in handling causal\nand temporal questions, highlighting its strength in reasoning multiple\nobject-based events.\n","authors":["Yanan Wang","Shuichiro Haruta","Donghuo Zeng","Julio Vizcarra","Mori Kurokawa"],"pdf_url":"https://arxiv.org/pdf/2409.07747v1.pdf","comment":"presented at MIRU2024"},{"id":"http://arxiv.org/abs/2409.07737v1","updated":"2024-09-12T04:06:31Z","published":"2024-09-12T04:06:31Z","title":"Ruri: Japanese General Text Embeddings","summary":" We report the development of Ruri, a series of Japanese general text\nembedding models. While the development of general-purpose text embedding\nmodels in English and multilingual contexts has been active in recent years,\nmodel development in Japanese remains insufficient. The primary reasons for\nthis are the lack of datasets and the absence of necessary expertise. In this\nreport, we provide a detailed account of the development process of Ruri.\nSpecifically, we discuss the training of embedding models using synthesized\ndatasets generated by LLMs, the construction of the reranker for dataset\nfiltering and knowledge distillation, and the performance evaluation of the\nresulting general-purpose text embedding models.\n","authors":["Hayato Tsukagoshi","Ryohei Sasano"],"pdf_url":"https://arxiv.org/pdf/2409.07737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07732v1","updated":"2024-09-12T03:41:39Z","published":"2024-09-12T03:41:39Z","title":"Large Language Models are Pattern Matchers: Editing Semi-Structured and\n Structured Documents with ChatGPT","summary":" Large Language Models (LLMs) offer numerous applications, the full extent of\nwhich is not yet understood. This paper investigates if LLMs can be applied for\nediting structured and semi-structured documents with minimal effort. Using a\nqualitative research approach, we conduct two case studies with ChatGPT and\nthoroughly analyze the results. Our experiments indicate that LLMs can\neffectively edit structured and semi-structured documents when provided with\nbasic, straightforward prompts. ChatGPT demonstrates a strong ability to\nrecognize and process the structure of annotated documents. This suggests that\nexplicitly structuring tasks and data in prompts might enhance an LLM's ability\nto understand and solve tasks. Furthermore, the experiments also reveal\nimpressive pattern matching skills in ChatGPT. This observation deserves\nfurther investigation, as it may contribute to understanding the processes\nleading to hallucinations in LLMs.\n","authors":["Irene Weber"],"pdf_url":"https://arxiv.org/pdf/2409.07732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18977v3","updated":"2024-09-12T03:24:58Z","published":"2024-06-27T08:13:33Z","title":"RoboUniView: Visual-Language Model with Unified View Representation for\n Robotic Manipulation","summary":" Utilizing Vision-Language Models (VLMs) for robotic manipulation represents a\nnovel paradigm, aiming to enhance the model's ability to generalize to new\nobjects and instructions. However, due to variations in camera specifications\nand mounting positions, existing methods exhibit significant performance\ndisparities across different robotic platforms. To address this challenge, we\npropose RoboUniView in this paper, an innovative approach that decouples visual\nfeature extraction from action learning. 
We first learn a unified view\nrepresentation from multi-perspective views by pre-training on readily\naccessible data, and then derive actions from this unified view representation\nto control robotic manipulation. This unified view representation more\naccurately mirrors the physical world and is not constrained by the robotic\nplatform's camera parameters. Thanks to this methodology, we achieve\nstate-of-the-art performance on the demanding CALVIN benchmark, enhancing the\nsuccess rate in the $D \\to D$ setting from 93.0% to 96.2%, and in the $ABC \\to\nD$ setting from 92.2% to 94.2%. Moreover, our model exhibits outstanding\nadaptability and flexibility: it maintains high performance under unseen camera\nparameters, can utilize multiple datasets with varying camera parameters, and\nis capable of joint cross-task learning across datasets. Code is provided for\nre-implementation. https://github.com/liufanfanlff/RoboUniview\n","authors":["Fanfan Liu","Feng Yan","Liming Zheng","Chengjian Feng","Yiyang Huang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2406.18977v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07713v1","updated":"2024-09-12T02:40:28Z","published":"2024-09-12T02:40:28Z","title":"Experimenting with Legal AI Solutions: The Case of Question-Answering\n for Access to Justice","summary":" Generative AI models, such as the GPT and Llama series, have significant\npotential to assist laypeople in answering legal questions. However, little\nprior work focuses on the data sourcing, inference, and evaluation of these\nmodels in the context of laypersons. To this end, we propose a human-centric\nlegal NLP pipeline, covering data sourcing, inference, and evaluation. We\nintroduce and release a dataset, LegalQA, with real and specific legal\nquestions spanning from employment law to criminal law, corresponding answers\nwritten by legal experts, and citations for each answer. We develop an\nautomatic evaluation protocol for this dataset, then show that\nretrieval-augmented generation from only 850 citations in the train set can\nmatch or outperform internet-wide retrieval, despite containing 9 orders of\nmagnitude less data. Finally, we propose future directions for open-sourced\nefforts, which fall behind closed-sourced models.\n","authors":["Jonathan Li","Rohan Bhambhoria","Samuel Dahan","Xiaodan Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.07713v1.pdf","comment":"Accepted into GenLaw '24 (ICML 2024 workshop)"},{"id":"http://arxiv.org/abs/2409.07703v1","updated":"2024-09-12T02:08:00Z","published":"2024-09-12T02:08:00Z","title":"DSBench: How Far Are Data Science Agents to Becoming Data Science\n Experts?","summary":" Large Language Models (LLMs) and Large Vision-Language Models (LVLMs) have\ndemonstrated impressive language/vision reasoning abilities, igniting the\nrecent trend of building agents for targeted applications such as shopping\nassistants or AI software engineers. Recently, many data science benchmarks\nhave been proposed to investigate their performance in the data science domain.\nHowever, existing data science benchmarks still fall short when compared to\nreal-world data science applications due to their simplified settings. To\nbridge this gap, we introduce DSBench, a comprehensive benchmark designed to\nevaluate data science agents with realistic tasks. This benchmark includes 466\ndata analysis tasks and 74 data modeling tasks, sourced from Eloquence and\nKaggle competitions. 
DSBench offers a realistic setting by encompassing long\ncontexts, multimodal task backgrounds, reasoning with large data files and\nmulti-table structures, and performing end-to-end data modeling tasks. Our\nevaluation of state-of-the-art LLMs, LVLMs, and agents shows that they struggle\nwith most tasks, with the best agent solving only 34.12% of data analysis tasks\nand achieving a 34.74% Relative Performance Gap (RPG). These findings\nunderscore the need for further advancements in developing more practical,\nintelligent, and autonomous data science agents.\n","authors":["Liqiang Jing","Zhehui Huang","Xiaoyang Wang","Wenlin Yao","Wenhao Yu","Kaixin Ma","Hongming Zhang","Xinya Du","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2409.07703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07691v1","updated":"2024-09-12T01:51:06Z","published":"2024-09-12T01:51:06Z","title":"Enhancing Q&A Text Retrieval with Ranking Models: Benchmarking,\n fine-tuning and deploying Rerankers for RAG","summary":" Ranking models play a crucial role in enhancing overall accuracy of text\nretrieval systems. These multi-stage systems typically utilize either dense\nembedding models or sparse lexical indices to retrieve relevant passages based\non a given query, followed by ranking models that refine the ordering of the\ncandidate passages by its relevance to the query.\n This paper benchmarks various publicly available ranking models and examines\ntheir impact on ranking accuracy. We focus on text retrieval for\nquestion-answering tasks, a common use case for Retrieval-Augmented Generation\nsystems. Our evaluation benchmarks include models some of which are\ncommercially viable for industrial applications.\n We introduce a state-of-the-art ranking model, NV-RerankQA-Mistral-4B-v3,\nwhich achieves a significant accuracy increase of ~14% compared to pipelines\nwith other rerankers. We also provide an ablation study comparing the\nfine-tuning of ranking models with different sizes, losses and self-attention\nmechanisms.\n Finally, we discuss challenges of text retrieval pipelines with ranking\nmodels in real-world industry applications, in particular the trade-offs among\nmodel size, ranking accuracy and system requirements like indexing and serving\nlatency / throughput.\n","authors":["Gabriel de Souza P. Moreira","Ronay Ak","Benedikt Schifferer","Mengyao Xu","Radek Osmulski","Even Oldridge"],"pdf_url":"https://arxiv.org/pdf/2409.07691v1.pdf","comment":"Accepted for the 1st Workshop on GenAI and RAG Systems for Enterprise\n @ CIKM 2024"},{"id":"http://arxiv.org/abs/2409.05448v2","updated":"2024-09-12T01:32:25Z","published":"2024-09-09T09:04:56Z","title":"Representational Analysis of Binding in Large Language Models","summary":" Entity tracking is essential for complex reasoning. To perform in-context\nentity tracking, language models (LMs) must bind an entity to its attribute\n(e.g., bind a container to its content) to recall attribute for a given entity.\nFor example, given a context mentioning ``The coffee is in Box Z, the stone is\nin Box M, the map is in Box H'', to infer ``Box Z contains the coffee'' later,\nLMs must bind ``Box Z'' to ``coffee''. To explain the binding behaviour of LMs,\nFeng and Steinhardt (2023) introduce a Binding ID mechanism and state that LMs\nuse a abstract concept called Binding ID (BI) to internally mark\nentity-attribute pairs. However, they have not directly captured the BI\ndeterminant information from entity activations. 
In this work, we provide a\nnovel view of the Binding ID mechanism by localizing the prototype of BI\ninformation. Specifically, we discover that there exists a low-rank subspace in\nthe hidden state (or activation) of LMs, that primarily encodes the order of\nentity and attribute and which is used as the prototype of BI to causally\ndetermine the binding. To identify this subspace, we choose principle component\nanalysis as our first attempt and it is empirically proven to be effective.\nMoreover, we also discover that when editing representations along directions\nin the subspace, LMs tend to bind a given entity to other attributes\naccordingly. For example, by patching activations along the BI encoding\ndirection we can make the LM to infer ``Box Z contains the stone'' and ``Box Z\ncontains the map''.\n","authors":["Qin Dai","Benjamin Heinzerling","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2409.05448v2.pdf","comment":"The key phrase \"BI Subspace\" might be misleading, because it sounds\n like the subspace that directly encodes BI, and which is different with its\n intended meaning that the subspace that is the base (or prototype) of BI.\n Therefore, the naming of the subspace and its corresponding wording needs\n further discussion and review"},{"id":"http://arxiv.org/abs/2409.07672v1","updated":"2024-09-12T00:27:31Z","published":"2024-09-12T00:27:31Z","title":"An Unsupervised Dialogue Topic Segmentation Model Based on Utterance\n Rewriting","summary":" Dialogue topic segmentation plays a crucial role in various types of dialogue\nmodeling tasks. The state-of-the-art unsupervised DTS methods learn topic-aware\ndiscourse representations from conversation data through adjacent discourse\nmatching and pseudo segmentation to further mine useful clues in unlabeled\nconversational relations. However, in multi-round dialogs, discourses often\nhave co-references or omissions, leading to the fact that direct use of these\ndiscourses for representation learning may negatively affect the semantic\nsimilarity computation in the neighboring discourse matching task. In order to\nfully utilize the useful cues in conversational relations, this study proposes\na novel unsupervised dialog topic segmentation method that combines the\nUtterance Rewriting (UR) technique with an unsupervised learning algorithm to\nefficiently utilize the useful cues in unlabeled dialogs by rewriting the\ndialogs in order to recover the co-referents and omitted words. Compared with\nexisting unsupervised models, the proposed Discourse Rewriting Topic\nSegmentation Model (UR-DTS) significantly improves the accuracy of topic\nsegmentation. The main finding is that the performance on DialSeg711 improves\nby about 6% in terms of absolute error score and WD, achieving 11.42% in terms\nof absolute error score and 12.97% in terms of WD. 
on Doc2Dial the absolute\nerror score and WD improves by about 3% and 2%, respectively, resulting in SOTA\nreaching 35.17% in terms of absolute error score and 38.49% in terms of WD.\nThis shows that the model is very effective in capturing the nuances of\nconversational topics, as well as the usefulness and challenges of utilizing\nunlabeled conversations.\n","authors":["Xia Hou","Qifeng Li","Tongliang Li"],"pdf_url":"https://arxiv.org/pdf/2409.07672v1.pdf","comment":"in Chinese language"},{"id":"http://arxiv.org/abs/2406.06852v4","updated":"2024-09-12T00:27:06Z","published":"2024-06-10T23:54:21Z","title":"A Survey of Backdoor Attacks and Defenses on Large Language Models:\n Implications for Security Measures","summary":" Large Language Models (LLMs), which bridge the gap between human language\nunderstanding and complex problem-solving, achieve state-of-the-art performance\non several NLP tasks, particularly in few-shot and zero-shot settings. Despite\nthe demonstrable efficacy of LLMs, due to constraints on computational\nresources, users have to engage with open-source language models or outsource\nthe entire training process to third-party platforms. However, research has\ndemonstrated that language models are susceptible to potential security\nvulnerabilities, particularly in backdoor attacks. Backdoor attacks are\ndesigned to introduce targeted vulnerabilities into language models by\npoisoning training samples or model weights, allowing attackers to manipulate\nmodel responses through malicious triggers. While existing surveys on backdoor\nattacks provide a comprehensive overview, they lack an in-depth examination of\nbackdoor attacks specifically targeting LLMs. To bridge this gap and grasp the\nlatest trends in the field, this paper presents a novel perspective on backdoor\nattacks for LLMs by focusing on fine-tuning methods. Specifically, we\nsystematically classify backdoor attacks into three categories: full-parameter\nfine-tuning, parameter-efficient fine-tuning, and no fine-tuning Based on\ninsights from a substantial review, we also discuss crucial issues for future\nresearch on backdoor attacks, such as further exploring attack algorithms that\ndo not require fine-tuning, or developing more covert attack algorithms.\n","authors":["Shuai Zhao","Meihuizi Jia","Zhongliang Guo","Leilei Gan","Xiaoyu Xu","Xiaobao Wu","Jie Fu","Yichao Feng","Fengjun Pan","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2406.06852v4.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.15794v3","updated":"2024-09-12T23:35:41Z","published":"2024-07-22T16:52:32Z","title":"Disentangling spatio-temporal knowledge for weakly supervised object\n detection and segmentation in surgical video","summary":" Weakly supervised video object segmentation (WSVOS) enables the\nidentification of segmentation maps without requiring an extensive training\ndataset of object masks, relying instead on coarse video labels indicating\nobject presence. Current state-of-the-art methods either require multiple\nindependent stages of processing that employ motion cues or, in the case of\nend-to-end trainable networks, lack in segmentation accuracy, in part due to\nthe difficulty of learning segmentation maps from videos with transient object\npresence. This limits the application of WSVOS for semantic annotation of\nsurgical videos where multiple surgical tools frequently move in and out of the\nfield of view, a problem that is more difficult than typically encountered in\nWSVOS. 
This paper introduces Video Spatio-Temporal Disentanglement Networks\n(VDST-Net), a framework to disentangle spatiotemporal information using\nsemi-decoupled knowledge distillation to predict high-quality class activation\nmaps (CAMs). A teacher network designed to resolve temporal conflicts when\nspecifics about object location and timing in the video are not provided works\nwith a student network that integrates information over time by leveraging\ntemporal dependencies. We demonstrate the efficacy of our framework on a public\nreference dataset and on a more challenging surgical video dataset where\nobjects are, on average, present in less than 60\\% of annotated frames. Our\nmethod outperforms state-of-the-art techniques and generates superior\nsegmentation masks under video-level weak supervision.\n","authors":["Guiqiu Liao","Matjaz Jogan","Sai Koushik","Eric Eaton","Daniel A. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2407.15794v3.pdf","comment":"13 pages, 6 figures, 8 tables"},{"id":"http://arxiv.org/abs/2409.08397v1","updated":"2024-09-12T20:56:16Z","published":"2024-09-12T20:56:16Z","title":"360PanT: Training-Free Text-Driven 360-Degree Panorama-to-Panorama\n Translation","summary":" Preserving boundary continuity in the translation of 360-degree panoramas\nremains a significant challenge for existing text-driven image-to-image\ntranslation methods. These methods often produce visually jarring\ndiscontinuities at the translated panorama's boundaries, disrupting the\nimmersive experience. To address this issue, we propose 360PanT, a\ntraining-free approach to text-based 360-degree panorama-to-panorama\ntranslation with boundary continuity. Our 360PanT achieves seamless\ntranslations through two key components: boundary continuity encoding and\nseamless tiling translation with spatial control. Firstly, the boundary\ncontinuity encoding embeds critical boundary continuity information of the\ninput 360-degree panorama into the noisy latent representation by constructing\nan extended input image. Secondly, leveraging this embedded noisy latent\nrepresentation and guided by a target prompt, the seamless tiling translation\nwith spatial control enables the generation of a translated image with\nidentical left and right halves while adhering to the extended input's\nstructure and semantic layout. This process ensures a final translated\n360-degree panorama with seamless boundary continuity. Experimental results on\nboth real-world and synthesized datasets demonstrate the effectiveness of our\n360PanT in translating 360-degree panoramas. Code is available at\n\\href{https://github.com/littlewhitesea/360PanT}{https://github.com/littlewhitesea/360PanT}.\n","authors":["Hai Wang","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2409.08397v1.pdf","comment":"Accepted by WACV 2025, Project Page:\n \\href{https://littlewhitesea.github.io/360PanT.github.io/}{https://littlewhitesea.github.io/360PanT.github.io/}"},{"id":"http://arxiv.org/abs/2409.08388v1","updated":"2024-09-12T20:34:34Z","published":"2024-09-12T20:34:34Z","title":"Continual Learning in 3D Point Clouds: Employing Spectral Techniques for\n Exemplar Selection","summary":" We introduce a novel framework for Continual Learning in 3D object\nclassification (CL3D). Our approach is based on the selection of prototypes\nfrom each class using spectral clustering. For non-Euclidean data such as point\nclouds, spectral clustering can be employed as long as one can define a\ndistance measure between pairs of samples. 
Choosing the appropriate distance\nmeasure enables us to leverage 3D geometric characteristics to identify\nrepresentative prototypes for each class. We explore the effectiveness of\nclustering in the input space (3D points), local feature space\n(1024-dimensional points), and global feature space. We conduct experiments on\nthe ModelNet40, ShapeNet, and ScanNet datasets, achieving state-of-the-art\naccuracy exclusively through the use of input space features. By leveraging the\ncombined input, local, and global features, we have improved the\nstate-of-the-art on ModelNet and ShapeNet, utilizing nearly half the memory\nused by competing approaches. For the challenging ScanNet dataset, our method\nenhances accuracy by 4.1% while consuming just 28% of the memory used by our\ncompetitors, demonstrating the scalability of our approach.\n","authors":["Hossein Resani","Behrooz Nasihatkon","Mohammadreza Alimoradi Jazi"],"pdf_url":"https://arxiv.org/pdf/2409.08388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08381v1","updated":"2024-09-12T20:02:51Z","published":"2024-09-12T20:02:51Z","title":"Rethinking Prompting Strategies for Multi-Label Recognition with Partial\n Annotations","summary":" Vision-language models (VLMs) like CLIP have been adapted for Multi-Label\nRecognition (MLR) with partial annotations by leveraging prompt-learning, where\npositive and negative prompts are learned for each class to associate their\nembeddings with class presence or absence in the shared vision-text feature\nspace. While this approach improves MLR performance by relying on VLM priors,\nwe hypothesize that learning negative prompts may be suboptimal, as the\ndatasets used to train VLMs lack image-caption pairs explicitly focusing on\nclass absence. To analyze the impact of positive and negative prompt learning\non MLR, we introduce PositiveCoOp and NegativeCoOp, where only one prompt is\nlearned with VLM guidance while the other is replaced by an embedding vector\nlearned directly in the shared feature space without relying on the text\nencoder. Through empirical analysis, we observe that negative prompts degrade\nMLR performance, and learning only positive prompts, combined with learned\nnegative embeddings (PositiveCoOp), outperforms dual prompt learning\napproaches. Moreover, we quantify the performance benefits that prompt-learning\noffers over a simple vision-features-only baseline, observing that the baseline\ndisplays strong performance comparable to dual prompt learning approach\n(DualCoOp), when the proportion of missing labels is low, while requiring half\nthe training compute and 16 times fewer parameters\n","authors":["Samyak Rawlekar","Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2409.08381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08376v1","updated":"2024-09-12T19:57:44Z","published":"2024-09-12T19:57:44Z","title":"Learned Compression for Images and Point Clouds","summary":" Over the last decade, deep learning has shown great success at performing\ncomputer vision tasks, including classification, super-resolution, and style\ntransfer. Now, we apply it to data compression to help build the next\ngeneration of multimedia codecs. This thesis provides three primary\ncontributions to this new field of learned compression. First, we present an\nefficient low-complexity entropy model that dynamically adapts the encoding\ndistribution to a specific input by compressing and transmitting the encoding\ndistribution itself as side information. 
Secondly, we propose a novel\nlightweight low-complexity point cloud codec that is highly specialized for\nclassification, attaining significant reductions in bitrate compared to\nnon-specialized codecs. Lastly, we explore how motion within the input domain\nbetween consecutive video frames is manifested in the corresponding\nconvolutionally-derived latent space.\n","authors":["Mateen Ulhaq"],"pdf_url":"https://arxiv.org/pdf/2409.08376v1.pdf","comment":"65 pages, 21 figures, Master's Thesis, defended in 2023"},{"id":"http://arxiv.org/abs/2406.11445v4","updated":"2024-09-12T19:36:06Z","published":"2024-06-17T11:57:14Z","title":"Solving the Inverse Problem of Electrocardiography for Cardiac Digital\n Twins: A Survey","summary":" Cardiac digital twins (CDTs) are personalized virtual representations used to\nunderstand complex cardiac mechanisms. A critical component of CDT development\nis solving the ECG inverse problem, which enables the reconstruction of cardiac\nsources and the estimation of patient-specific electrophysiology (EP)\nparameters from surface ECG data. Despite challenges from complex cardiac\nanatomy, noisy ECG data, and the ill-posed nature of the inverse problem,\nrecent advances in computational methods have greatly improved the accuracy and\nefficiency of ECG inverse inference, strengthening the fidelity of CDTs. This\npaper aims to provide a comprehensive review of the methods of solving ECG\ninverse problem, the validation strategies, the clinical applications, and\nfuture perspectives. For the methodologies, we broadly classify\nstate-of-the-art approaches into two categories: deterministic and\nprobabilistic methods, including both conventional and deep learning-based\ntechniques. Integrating physics laws with deep learning models holds promise,\nbut challenges such as capturing dynamic electrophysiology accurately,\naccessing accurate domain knowledge, and quantifying prediction uncertainty\npersist. Integrating models into clinical workflows while ensuring\ninterpretability and usability for healthcare professionals is essential.\nOvercoming these challenges will drive further research in CDTs.\n","authors":["Lei Li","Julia Camps","Blanca Rodriguez","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2406.11445v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17744v3","updated":"2024-09-12T19:35:39Z","published":"2023-11-29T15:49:31Z","title":"Variational Bayes image restoration with compressive autoencoders","summary":" Regularization of inverse problems is of paramount importance in\ncomputational imaging. The ability of neural networks to learn efficient image\nrepresentations has been recently exploited to design powerful data-driven\nregularizers. While state-of-the-art plug-and-play methods rely on an implicit\nregularization provided by neural denoisers, alternative Bayesian approaches\nconsider Maximum A Posteriori (MAP) estimation in the latent space of a\ngenerative model, thus with an explicit regularization. However,\nstate-of-the-art deep generative models require a huge amount of training data\ncompared to denoisers. Besides, their complexity hampers the optimization\ninvolved in latent MAP derivation. In this work, we first propose to use\ncompressive autoencoders instead. These networks, which can be seen as\nvariational autoencoders with a flexible latent prior, are smaller and easier\nto train than state-of-the-art generative models. 
As a second contribution, we\nintroduce the Variational Bayes Latent Estimation (VBLE) algorithm, which\nperforms latent estimation within the framework of variational inference.\nThanks to a simple yet efficient parameterization of the variational posterior,\nVBLE allows for fast and easy (approximate) posterior sampling.Experimental\nresults on image datasets BSD and FFHQ demonstrate that VBLE reaches similar\nperformance than state-of-the-art plug-and-play methods, while being able to\nquantify uncertainties significantly faster than other existing posterior\nsampling techniques.\n","authors":["Maud Biquard","Marie Chabert","Florence Genin","Christophe Latry","Thomas Oberlin"],"pdf_url":"https://arxiv.org/pdf/2311.17744v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08061v4","updated":"2024-09-12T18:35:41Z","published":"2024-07-10T21:51:50Z","title":"Geospecific View Generation -- Geometry-Context Aware High-resolution\n Ground View Inference from Satellite Views","summary":" Predicting realistic ground views from satellite imagery in urban scenes is a\nchallenging task due to the significant view gaps between satellite and\nground-view images. We propose a novel pipeline to tackle this challenge, by\ngenerating geospecifc views that maximally respect the weak geometry and\ntexture from multi-view satellite images. Different from existing approaches\nthat hallucinate images from cues such as partial semantics or geometry from\noverhead satellite images, our method directly predicts ground-view images at\ngeolocation by using a comprehensive set of information from the satellite\nimage, resulting in ground-level images with a resolution boost at a factor of\nten or more. We leverage a novel building refinement method to reduce geometric\ndistortions in satellite data at ground level, which ensures the creation of\naccurate conditions for view synthesis using diffusion networks. Moreover, we\nproposed a novel geospecific prior, which prompts distribution learning of\ndiffusion models to respect image samples that are closer to the geolocation of\nthe predicted images. We demonstrate our pipeline is the first to generate\nclose-to-real and geospecific ground views merely based on satellite images.\n","authors":["Ningli Xu","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2407.08061v4.pdf","comment":"11 figures"},{"id":"http://arxiv.org/abs/2409.08353v1","updated":"2024-09-12T18:33:13Z","published":"2024-09-12T18:33:13Z","title":"Robust Dual Gaussian Splatting for Immersive Human-centric Volumetric\n Videos","summary":" Volumetric video represents a transformative advancement in visual media,\nenabling users to freely navigate immersive virtual experiences and narrowing\nthe gap between digital and real worlds. However, the need for extensive manual\nintervention to stabilize mesh sequences and the generation of excessively\nlarge assets in existing workflows impedes broader adoption. In this paper, we\npresent a novel Gaussian-based approach, dubbed \\textit{DualGS}, for real-time\nand high-fidelity playback of complex human performance with excellent\ncompression ratios. Our key idea in DualGS is to separately represent motion\nand appearance using the corresponding skin and joint Gaussians. Such an\nexplicit disentanglement can significantly reduce motion redundancy and enhance\ntemporal coherence. We begin by initializing the DualGS and anchoring skin\nGaussians to joint Gaussians at the first frame. 
Subsequently, we employ a\ncoarse-to-fine training strategy for frame-by-frame human performance modeling.\nIt includes a coarse alignment phase for overall motion prediction as well as a\nfine-grained optimization for robust tracking and high-fidelity rendering. To\nintegrate volumetric video seamlessly into VR environments, we efficiently\ncompress motion using entropy encoding and appearance using codec compression\ncoupled with a persistent codebook. Our approach achieves a compression ratio\nof up to 120 times, only requiring approximately 350KB of storage per frame. We\ndemonstrate the efficacy of our representation through photo-realistic,\nfree-view experiences on VR headsets, enabling users to immersively watch\nmusicians in performance and feel the rhythm of the notes at the performers'\nfingertips.\n","authors":["Yuheng Jiang","Zhehao Shen","Yu Hong","Chengcheng Guo","Yize Wu","Yingliang Zhang","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2409.08353v1.pdf","comment":"Accepted at SIGGRAPH Asia 2024. Project page:\n https://nowheretrix.github.io/DualGS/"},{"id":"http://arxiv.org/abs/2409.08351v1","updated":"2024-09-12T18:30:41Z","published":"2024-09-12T18:30:41Z","title":"Bayesian Inverse Graphics for Few-Shot Concept Learning","summary":" Humans excel at building generalizations of new concepts from just one single\nexample. Contrary to this, current computer vision models typically require\nlarge amount of training samples to achieve a comparable accuracy. In this work\nwe present a Bayesian model of perception that learns using only minimal data,\na prototypical probabilistic program of an object. Specifically, we propose a\ngenerative inverse graphics model of primitive shapes, to infer posterior\ndistributions over physically consistent parameters from one or several images.\nWe show how this representation can be used for downstream tasks such as\nfew-shot classification and pose estimation. Our model outperforms existing\nfew-shot neural-only classification algorithms and demonstrates generalization\nacross varying lighting conditions, backgrounds, and out-of-distribution\nshapes. By design, our model is uncertainty-aware and uses our new\ndifferentiable renderer for optimizing global scene parameters through gradient\ndescent, sampling posterior distributions over object parameters with Markov\nChain Monte Carlo (MCMC), and using a neural based likelihood function.\n","authors":["Octavio Arriaga","Jichen Guo","Rebecca Adam","Sebastian Houben","Frank Kirchner"],"pdf_url":"https://arxiv.org/pdf/2409.08351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06442v2","updated":"2024-09-12T18:22:51Z","published":"2024-09-10T11:48:05Z","title":"Prompt2Fashion: An automatically generated fashion dataset","summary":" Despite the rapid evolution and increasing efficacy of language and vision\ngenerative models, there remains a lack of comprehensive datasets that bridge\nthe gap between personalized fashion needs and AI-driven design, limiting the\npotential for truly inclusive and customized fashion solutions. In this work,\nwe leverage generative models to automatically construct a fashion image\ndataset tailored to various occasions, styles, and body types as instructed by\nusers. We use different Large Language Models (LLMs) and prompting strategies\nto offer personalized outfits of high aesthetic quality, detail, and relevance\nto both expert and non-expert users' requirements, as demonstrated by\nqualitative analysis. 
Up until now the evaluation of the generated outfits has\nbeen conducted by non-expert human subjects. Despite the provided fine-grained\ninsights on the quality and relevance of generation, we extend the discussion\non the importance of expert knowledge for the evaluation of artistic\nAI-generated datasets such as this one. Our dataset is publicly available on\nGitHub at https://github.com/georgiarg/Prompt2Fashion.\n","authors":["Georgia Argyrou","Angeliki Dimitriou","Maria Lymperaiou","Giorgos Filandrianos","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2409.06442v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08345v1","updated":"2024-09-12T18:18:02Z","published":"2024-09-12T18:18:02Z","title":"SIG: A Synthetic Identity Generation Pipeline for Generating Evaluation\n Datasets for Face Recognition","summary":" As Artificial Intelligence applications expand, the evaluation of models\nfaces heightened scrutiny. Ensuring public readiness requires evaluation\ndatasets, which differ from training data by being disjoint and ethically\nsourced in compliance with privacy regulations. The performance and fairness of\nface recognition systems depend significantly on the quality and\nrepresentativeness of these evaluation datasets. This data is sometimes scraped\nfrom the internet without user's consent, causing ethical concerns that can\nprohibit its use without proper releases. In rare cases, data is collected in a\ncontrolled environment with consent, however, this process is time-consuming,\nexpensive, and logistically difficult to execute. This creates a barrier for\nthose unable to conjure the immense resources required to gather ethically\nsourced evaluation datasets. To address these challenges, we introduce the\nSynthetic Identity Generation pipeline, or SIG, that allows for the targeted\ncreation of ethical, balanced datasets for face recognition evaluation. Our\nproposed and demonstrated pipeline generates high-quality images of synthetic\nidentities with controllable pose, facial features, and demographic attributes,\nsuch as race, gender, and age. We also release an open-source evaluation\ndataset named ControlFace10k, consisting of 10,008 face images of 3,336 unique\nsynthetic identities balanced across race, gender, and age, generated using the\nproposed SIG pipeline. We analyze ControlFace10k along with a non-synthetic\nBUPT dataset using state-of-the-art face recognition algorithms to demonstrate\nits effectiveness as an evaluation tool. This analysis highlights the dataset's\ncharacteristics and its utility in assessing algorithmic bias across different\ndemographic groups.\n","authors":["Kassi Nzalasse","Rishav Raj","Eli Laird","Corey Clark"],"pdf_url":"https://arxiv.org/pdf/2409.08345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08331v1","updated":"2024-09-12T18:00:25Z","published":"2024-09-12T18:00:25Z","title":"Digital Volumetric Biopsy Cores Improve Gleason Grading of Prostate\n Cancer Using Deep Learning","summary":" Prostate cancer (PCa) was the most frequently diagnosed cancer among American\nmen in 2023. The histological grading of biopsies is essential for diagnosis,\nand various deep learning-based solutions have been developed to assist with\nthis task. Existing deep learning frameworks are typically applied to\nindividual 2D cross-sections sliced from 3D biopsy tissue specimens. This\nprocess impedes the analysis of complex tissue structures such as glands, which\ncan vary depending on the tissue slice examined. 
We propose a novel digital\npathology data source called a \"volumetric core,\" obtained via the extraction\nand co-alignment of serially sectioned tissue sections using a novel\nmorphology-preserving alignment framework. We trained an attention-based\nmultiple-instance learning (ABMIL) framework on deep features extracted from\nvolumetric patches to automatically classify the Gleason Grade Group (GGG). To\nhandle volumetric patches, we used a modified video transformer with a deep\nfeature extractor pretrained using self-supervised learning. We ran our\nmorphology-preserving alignment framework to construct 10,210 volumetric cores,\nleaving out 30% for pretraining. The rest of the dataset was used to train\nABMIL, which resulted in a 0.958 macro-average AUC, 0.671 F1 score, 0.661\nprecision, and 0.695 recall averaged across all five GGG significantly\noutperforming the 2D baselines.\n","authors":["Ekaterina Redekop","Mara Pleasure","Zichen Wang","Anthony Sisk","Yang Zong","Kimberly Flores","William Speier","Corey W. Arnold"],"pdf_url":"https://arxiv.org/pdf/2409.08331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08278v1","updated":"2024-09-12T17:59:49Z","published":"2024-09-12T17:59:49Z","title":"DreamHOI: Subject-Driven Generation of 3D Human-Object Interactions with\n Diffusion Priors","summary":" We present DreamHOI, a novel method for zero-shot synthesis of human-object\ninteractions (HOIs), enabling a 3D human model to realistically interact with\nany given object based on a textual description. This task is complicated by\nthe varying categories and geometries of real-world objects and the scarcity of\ndatasets encompassing diverse HOIs. To circumvent the need for extensive data,\nwe leverage text-to-image diffusion models trained on billions of image-caption\npairs. We optimize the articulation of a skinned human mesh using Score\nDistillation Sampling (SDS) gradients obtained from these models, which predict\nimage-space edits. However, directly backpropagating image-space gradients into\ncomplex articulation parameters is ineffective due to the local nature of such\ngradients. To overcome this, we introduce a dual implicit-explicit\nrepresentation of a skinned mesh, combining (implicit) neural radiance fields\n(NeRFs) with (explicit) skeleton-driven mesh articulation. During optimization,\nwe transition between implicit and explicit forms, grounding the NeRF\ngeneration while refining the mesh articulation. We validate our approach\nthrough extensive experiments, demonstrating its effectiveness in generating\nrealistic HOIs.\n","authors":["Thomas Hanwen Zhu","Ruining Li","Tomas Jakab"],"pdf_url":"https://arxiv.org/pdf/2409.08278v1.pdf","comment":"Project page: https://DreamHOI.github.io/"},{"id":"http://arxiv.org/abs/2409.08277v1","updated":"2024-09-12T17:59:46Z","published":"2024-09-12T17:59:46Z","title":"Depth on Demand: Streaming Dense Depth from a Low Frame Rate Active\n Sensor","summary":" High frame rate and accurate depth estimation plays an important role in\nseveral tasks crucial to robotics and automotive perception. To date, this can\nbe achieved through ToF and LiDAR devices for indoor and outdoor applications,\nrespectively. However, their applicability is limited by low frame rate, energy\nconsumption, and spatial sparsity. Depth on Demand (DoD) allows for accurate\ntemporal and spatial depth densification achieved by exploiting a high frame\nrate RGB sensor coupled with a potentially lower frame rate and sparse active\ndepth sensor. 
Our proposal jointly enables lower energy consumption and denser\nshape reconstruction, by significantly reducing the streaming requirements on\nthe depth sensor thanks to its three core stages: i) multi-modal encoding, ii)\niterative multi-modal integration, and iii) depth decoding. We present extended\nevidence assessing the effectiveness of DoD on indoor and outdoor video\ndatasets, covering both environment scanning and automotive perception use\ncases.\n","authors":["Andrea Conti","Matteo Poggi","Valerio Cambareri","Stefano Mattoccia"],"pdf_url":"https://arxiv.org/pdf/2409.08277v1.pdf","comment":"Accepted for publication at the European Conference on Computer\n Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2409.08849v1","updated":"2024-09-12T17:59:08Z","published":"2024-09-12T17:59:08Z","title":"DeCLIP: Decoding CLIP representations for deepfake localization","summary":" Generative models can create entirely new images, but they can also partially\nmodify real images in ways that are undetectable to the human eye. In this\npaper, we address the challenge of automatically detecting such local\nmanipulations. One of the most pressing problems in deepfake detection remains\nthe ability of models to generalize to different classes of generators. In the\ncase of fully manipulated images, representations extracted from large\nself-supervised models (such as CLIP) provide a promising direction towards\nmore robust detectors. Here, we introduce DeCLIP, a first attempt to leverage\nsuch large pretrained features for detecting local manipulations. We show that,\nwhen combined with a reasonably large convolutional decoder, pretrained\nself-supervised representations are able to perform localization and improve\ngeneralization capabilities over existing methods. Unlike previous work, our\napproach is able to perform localization on the challenging case of latent\ndiffusion models, where the entire image is affected by the fingerprint of the\ngenerator. Moreover, we observe that this type of data, which combines local\nsemantic information with a global fingerprint, provides more stable\ngeneralization than other categories of generative methods.\n","authors":["Stefan Smeu","Elisabeta Oneata","Dan Oneata"],"pdf_url":"https://arxiv.org/pdf/2409.08849v1.pdf","comment":"Accepted at Winter Conference on Applications of Computer Vision\n (WACV) 2025"},{"id":"http://arxiv.org/abs/2409.08273v1","updated":"2024-09-12T17:59:07Z","published":"2024-09-12T17:59:07Z","title":"Hand-Object Interaction Pretraining from Videos","summary":" We present an approach to learn general robot manipulation priors from 3D\nhand-object interaction trajectories. We build a framework to use in-the-wild\nvideos to generate sensorimotor robot trajectories. We do so by lifting both\nthe human hand and the manipulated object in a shared 3D space and retargeting\nhuman motions to robot actions. Generative modeling on this data gives us a\ntask-agnostic base policy. This policy captures a general yet flexible\nmanipulation prior. We empirically demonstrate that finetuning this policy,\nwith both reinforcement learning (RL) and behavior cloning (BC), enables\nsample-efficient adaptation to downstream tasks and simultaneously improves\nrobustness and generalizability compared to prior approaches. 
Qualitative\nexperiments are available at: \\url{https://hgaurav2k.github.io/hop/}.\n","authors":["Himanshu Gaurav Singh","Antonio Loquercio","Carmelo Sferrazza","Jane Wu","Haozhi Qi","Pieter Abbeel","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2409.08273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08272v1","updated":"2024-09-12T17:59:04Z","published":"2024-09-12T17:59:04Z","title":"Click2Mask: Local Editing with Dynamic Mask Generation","summary":" Recent advancements in generative models have revolutionized image generation\nand editing, making these tasks accessible to non-experts. This paper focuses\non local image editing, particularly the task of adding new content to a\nloosely specified area. Existing methods often require a precise mask or a\ndetailed description of the location, which can be cumbersome and prone to\nerrors. We propose Click2Mask, a novel approach that simplifies the local\nediting process by requiring only a single point of reference (in addition to\nthe content description). A mask is dynamically grown around this point during\na Blended Latent Diffusion (BLD) process, guided by a masked CLIP-based\nsemantic loss. Click2Mask surpasses the limitations of segmentation-based and\nfine-tuning dependent methods, offering a more user-friendly and contextually\naccurate solution. Our experiments demonstrate that Click2Mask not only\nminimizes user effort but also delivers competitive or superior local image\nmanipulation results compared to SoTA methods, according to both human\njudgement and automatic metrics. Key contributions include the simplification\nof user input, the ability to freely add objects unconstrained by existing\nsegments, and the integration potential of our dynamic mask approach within\nother editing methods.\n","authors":["Omer Regev","Omri Avrahami","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2409.08272v1.pdf","comment":"Project page is available at https://omeregev.github.io/click2mask/"},{"id":"http://arxiv.org/abs/2409.08271v1","updated":"2024-09-12T17:58:31Z","published":"2024-09-12T17:58:31Z","title":"DreamBeast: Distilling 3D Fantastical Animals with Part-Aware Knowledge\n Transfer","summary":" We present DreamBeast, a novel method based on score distillation sampling\n(SDS) for generating fantastical 3D animal assets composed of distinct parts.\nExisting SDS methods often struggle with this generation task due to a limited\nunderstanding of part-level semantics in text-to-image diffusion models. While\nrecent diffusion models, such as Stable Diffusion 3, demonstrate a better\npart-level understanding, they are prohibitively slow and exhibit other common\nproblems associated with single-view diffusion models. DreamBeast overcomes\nthis limitation through a novel part-aware knowledge transfer mechanism. For\neach generated asset, we efficiently extract part-level knowledge from the\nStable Diffusion 3 model into a 3D Part-Affinity implicit representation. This\nenables us to instantly generate Part-Affinity maps from arbitrary camera\nviews, which we then use to modulate the guidance of a multi-view diffusion\nmodel during SDS to create 3D assets of fantastical animals. 
DreamBeast\nsignificantly enhances the quality of generated 3D creatures with\nuser-specified part compositions while reducing computational overhead, as\ndemonstrated by extensive quantitative and qualitative evaluations.\n","authors":["Runjia Li","Junlin Han","Luke Melas-Kyriazi","Chunyi Sun","Zhaochong An","Zhongrui Gui","Shuyang Sun","Philip Torr","Tomas Jakab"],"pdf_url":"https://arxiv.org/pdf/2409.08271v1.pdf","comment":"Project page: https://dreambeast3d.github.io/, code:\n https://github.com/runjiali-rl/threestudio-dreambeast"},{"id":"http://arxiv.org/abs/2409.08270v1","updated":"2024-09-12T17:58:13Z","published":"2024-09-12T17:58:13Z","title":"FlashSplat: 2D to 3D Gaussian Splatting Segmentation Solved Optimally","summary":" This study addresses the challenge of accurately segmenting 3D Gaussian\nSplatting from 2D masks. Conventional methods often rely on iterative gradient\ndescent to assign each Gaussian a unique label, leading to lengthy optimization\nand sub-optimal solutions. Instead, we propose a straightforward yet globally\noptimal solver for 3D-GS segmentation. The core insight of our method is that,\nwith a reconstructed 3D-GS scene, the rendering of the 2D masks is essentially\na linear function with respect to the labels of each Gaussian. As such, the\noptimal label assignment can be solved via linear programming in closed form.\nThis solution capitalizes on the alpha blending characteristic of the splatting\nprocess for single step optimization. By incorporating the background bias in\nour objective function, our method shows superior robustness in 3D segmentation\nagainst noises. Remarkably, our optimization completes within 30 seconds, about\n50$\\times$ faster than the best existing methods. Extensive experiments\ndemonstrate the efficiency and robustness of our method in segmenting various\nscenes, and its superior performance in downstream tasks such as object removal\nand inpainting. Demos and code will be available at\nhttps://github.com/florinshen/FlashSplat.\n","authors":["Qiuhong Shen","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08270v1.pdf","comment":"ECCV'2024"},{"id":"http://arxiv.org/abs/2407.02403v2","updated":"2024-09-12T17:57:52Z","published":"2024-07-02T16:21:44Z","title":"Face Reconstruction Transfer Attack as Out-of-Distribution\n Generalization","summary":" Understanding the vulnerability of face recognition systems to malicious\nattacks is of critical importance. Previous works have focused on\nreconstructing face images that can penetrate a targeted verification system.\nEven in the white-box scenario, however, naively reconstructed images\nmisrepresent the identity information, hence the attacks are easily neutralized\nonce the face system is updated or changed. In this paper, we aim to\nreconstruct face images which are capable of transferring face attacks on\nunseen encoders. We term this problem as Face Reconstruction Transfer Attack\n(FRTA) and show that it can be formulated as an out-of-distribution (OOD)\ngeneralization problem. Inspired by its OOD nature, we propose to solve FRTA by\nAveraged Latent Search and Unsupervised Validation with pseudo target (ALSUV).\nTo strengthen the reconstruction attack on OOD unseen encoders, ALSUV\nreconstructs the face by searching the latent of amortized generator StyleGAN2\nthrough multiple latent optimization, latent optimization trajectory averaging,\nand unsupervised validation with a pseudo target. 
We demonstrate the efficacy\nand generalization of our method on widely used face datasets, accompanying it\nwith extensive ablation studies and visual, qualitative, and quantitative\nanalyses. The source code will be released.\n","authors":["Yoon Gyo Jung","Jaewoo Park","Xingbo Dong","Hojin Park","Andrew Beng Jin Teoh","Octavia Camps"],"pdf_url":"https://arxiv.org/pdf/2407.02403v2.pdf","comment":"Accepted to ECCV2024"},{"id":"http://arxiv.org/abs/2409.08260v1","updated":"2024-09-12T17:55:37Z","published":"2024-09-12T17:55:37Z","title":"Improving Text-guided Object Inpainting with Semantic Pre-inpainting","summary":" Recent years have witnessed the success of large text-to-image diffusion\nmodels and their remarkable potential to generate high-quality images. The\nfurther pursuit of enhancing the editability of images has sparked significant\ninterest in the downstream task of inpainting a novel object described by a\ntext prompt within a designated region in the image. Nevertheless, the problem\nis not trivial from two aspects: 1) Solely relying on one single U-Net to align\ntext prompt and visual object across all the denoising timesteps is\ninsufficient to generate desired objects; 2) The controllability of object\ngeneration is not guaranteed in the intricate sampling space of diffusion\nmodel. In this paper, we propose to decompose the typical single-stage object\ninpainting into two cascaded processes: 1) semantic pre-inpainting that infers\nthe semantic features of desired objects in a multi-modal feature space; 2)\nhigh-fidelity object generation in diffusion latent space that pivots on such\ninpainted semantic features. To achieve this, we cascade a Transformer-based\nsemantic inpainter and an object inpainting diffusion model, leading to a novel\nCAscaded Transformer-Diffusion (CAT-Diffusion) framework for text-guided object\ninpainting. Technically, the semantic inpainter is trained to predict the\nsemantic features of the target object conditioning on unmasked context and\ntext prompt. The outputs of the semantic inpainter then act as the informative\nvisual prompts to guide high-fidelity object generation through a reference\nadapter layer, leading to controllable object inpainting. Extensive evaluations\non OpenImages-V6 and MSCOCO validate the superiority of CAT-Diffusion against\nthe state-of-the-art methods. Code is available at\n\\url{https://github.com/Nnn-s/CATdiffusion}.\n","authors":["Yifu Chen","Jingwen Chen","Yingwei Pan","Yehao Li","Ting Yao","Zhineng Chen","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.08260v1.pdf","comment":"ECCV 2024. Source code is available at\n https://github.com/Nnn-s/CATdiffusion"},{"id":"http://arxiv.org/abs/2409.08258v1","updated":"2024-09-12T17:55:11Z","published":"2024-09-12T17:55:11Z","title":"Improving Virtual Try-On with Garment-focused Diffusion Models","summary":" Diffusion models have revolutionized generative modeling in\nnumerous image synthesis tasks. Nevertheless, it is not trivial to directly\napply diffusion models for synthesizing an image of a target person wearing a\ngiven in-shop garment, i.e., image-based virtual try-on (VTON) task. The\ndifficulty originates from the aspect that the diffusion process should not\nonly produce a holistically high-fidelity photorealistic image of the target\nperson, but also locally preserve every appearance and texture detail of the\ngiven garment. 
To address this, we shape a new Diffusion model, namely GarDiff,\nwhich triggers the garment-focused diffusion process with amplified guidance of\nboth basic visual appearance and detailed textures (i.e., high-frequency\ndetails) derived from the given garment. GarDiff first remoulds a pre-trained\nlatent diffusion model with additional appearance priors derived from the CLIP\nand VAE encodings of the reference garment. Meanwhile, a novel garment-focused\nadapter is integrated into the UNet of diffusion model, pursuing local\nfine-grained alignment with the visual appearance of reference garment and\nhuman pose. We specifically design an appearance loss over the synthesized\ngarment to enhance the crucial, high-frequency details. Extensive experiments\non VITON-HD and DressCode datasets demonstrate the superiority of our GarDiff\nwhen compared to state-of-the-art VTON approaches. Code is publicly available\nat:\n\\href{https://github.com/siqi0905/GarDiff/tree/master}{https://github.com/siqi0905/GarDiff/tree/master}.\n","authors":["Siqi Wan","Yehao Li","Jingwen Chen","Yingwei Pan","Ting Yao","Yang Cao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.08258v1.pdf","comment":"ECCV 2024. Source code is available at\n https://github.com/siqi0905/GarDiff/tree/master"},{"id":"http://arxiv.org/abs/2409.08251v1","updated":"2024-09-12T17:48:22Z","published":"2024-09-12T17:48:22Z","title":"Dynamic Prompting of Frozen Text-to-Image Diffusion Models for Panoptic\n Narrative Grounding","summary":" Panoptic narrative grounding (PNG), whose core target is fine-grained\nimage-text alignment, requires a panoptic segmentation of referred objects\ngiven a narrative caption. Previous discriminative methods achieve only weak or\ncoarse-grained alignment by panoptic segmentation pretraining or CLIP model\nadaptation. Given the recent progress of text-to-image Diffusion models,\nseveral works have shown their capability to achieve fine-grained image-text\nalignment through cross-attention maps and improved general segmentation\nperformance. However, the direct use of phrase features as static prompts to\napply frozen Diffusion models to the PNG task still suffers from a large task\ngap and insufficient vision-language interaction, yielding inferior\nperformance. Therefore, we propose an Extractive-Injective Phrase Adapter\n(EIPA) bypass within the Diffusion UNet to dynamically update phrase prompts\nwith image features and inject the multimodal cues back, which leverages the\nfine-grained image-text alignment capability of Diffusion models more\nsufficiently. In addition, we also design a Multi-Level Mutual Aggregation\n(MLMA) module to reciprocally fuse multi-level image and phrase features for\nsegmentation refinement. Extensive experiments on the PNG benchmark show that\nour method achieves new state-of-the-art performance.\n","authors":["Hongyu Li","Tianrui Hui","Zihan Ding","Jing Zhang","Bin Ma","Xiaoming Wei","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2409.08251v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2409.08248v1","updated":"2024-09-12T17:47:51Z","published":"2024-09-12T17:47:51Z","title":"TextBoost: Towards One-Shot Personalization of Text-to-Image Models via\n Fine-tuning Text Encoder","summary":" Recent breakthroughs in text-to-image models have opened up promising\nresearch avenues in personalized image generation, enabling users to create\ndiverse images of a specific subject using natural language prompts. 
However,\nexisting methods often suffer from performance degradation when given only a\nsingle reference image. They tend to overfit the input, producing highly\nsimilar outputs regardless of the text prompt. This paper addresses the\nchallenge of one-shot personalization by mitigating overfitting, enabling the\ncreation of controllable images through text prompts. Specifically, we propose\na selective fine-tuning strategy that focuses on the text encoder. Furthermore,\nwe introduce three key techniques to enhance personalization performance: (1)\naugmentation tokens to encourage feature disentanglement and alleviate\noverfitting, (2) a knowledge-preservation loss to reduce language drift and\npromote generalizability across diverse prompts, and (3) SNR-weighted sampling\nfor efficient training. Extensive experiments demonstrate that our approach\nefficiently generates high-quality, diverse images using only a single\nreference image while significantly reducing memory and storage requirements.\n","authors":["NaHyeon Park","Kunhee Kim","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2409.08248v1.pdf","comment":"Project page: https://textboost.github.io"},{"id":"http://arxiv.org/abs/2409.08245v1","updated":"2024-09-12T17:44:07Z","published":"2024-09-12T17:44:07Z","title":"Style Based Clustering of Visual Artworks","summary":" Clustering artworks based on style has many potential real-world applications\nlike art recommendations, style-based search and retrieval, and the study of\nartistic style evolution in an artwork corpus. However, clustering artworks\nbased on style is largely an unaddressed problem. A few present methods for\nclustering artworks principally rely on generic image feature representations\nderived from deep neural networks and do not specifically deal with the\nartistic style. In this paper, we introduce and deliberate over the notion of\nstyle-based clustering of visual artworks. Our main objective is to explore\nneural feature representations and architectures that can be used for\nstyle-based clustering and observe their impact and effectiveness. We develop\ndifferent methods and assess their relative efficacy for style-based clustering\nthrough qualitative and quantitative analysis by applying them to four artwork\ncorpora and four curated synthetically styled datasets. Our analysis provides\nsome key novel insights on architectures, feature representations, and\nevaluation methods suitable for style-based clustering.\n","authors":["Abhishek Dangeti","Pavan Gajula","Vivek Srivastava","Vikram Jamwal"],"pdf_url":"https://arxiv.org/pdf/2409.08245v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2409.08240v1","updated":"2024-09-12T17:39:23Z","published":"2024-09-12T17:39:23Z","title":"IFAdapter: Instance Feature Control for Grounded Text-to-Image\n Generation","summary":" While Text-to-Image (T2I) diffusion models excel at generating visually\nappealing images of individual instances, they struggle to accurately position\nand control the features generation of multiple instances. The Layout-to-Image\n(L2I) task was introduced to address the positioning challenges by\nincorporating bounding boxes as spatial control signals, but it still falls\nshort in generating precise instance features. In response, we propose the\nInstance Feature Generation (IFG) task, which aims to ensure both positional\naccuracy and feature fidelity in generated instances. To address the IFG task,\nwe introduce the Instance Feature Adapter (IFAdapter). 
The IFAdapter enhances\nfeature depiction by incorporating additional appearance tokens and utilizing\nan Instance Semantic Map to align instance-level features with spatial\nlocations. The IFAdapter guides the diffusion process as a plug-and-play\nmodule, making it adaptable to various community models. For evaluation, we\ncontribute an IFG benchmark and develop a verification pipeline to objectively\ncompare models' abilities to generate instances with accurate positioning and\nfeatures. Experimental results demonstrate that IFAdapter outperforms other\nmodels in both quantitative and qualitative evaluations.\n","authors":["Yinwei Wu","Xianpan Zhou","Bing Ma","Xuefeng Su","Kai Ma","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08232v1","updated":"2024-09-12T17:24:50Z","published":"2024-09-12T17:24:50Z","title":"Model Ensemble for Brain Tumor Segmentation in Magnetic Resonance\n Imaging","summary":" Segmenting brain tumors in multi-parametric magnetic resonance imaging\nenables performing quantitative analysis in support of clinical trials and\npersonalized patient care. This analysis provides the potential to impact\nclinical decision-making processes, including diagnosis and prognosis. In 2023,\nthe well-established Brain Tumor Segmentation (BraTS) challenge presented a\nsubstantial expansion with eight tasks and 4,500 brain tumor cases. In this\npaper, we present a deep learning-based ensemble strategy that is evaluated for\nnewly included tumor cases in three tasks: pediatric brain tumors (PED),\nintracranial meningioma (MEN), and brain metastases (MET). In particular, we\nensemble outputs from state-of-the-art nnU-Net and Swin UNETR models on a\nregion-wise basis. Furthermore, we implemented a targeted post-processing\nstrategy based on a cross-validated threshold search to improve the\nsegmentation results for tumor sub-regions. The evaluation of our proposed\nmethod on unseen test cases for the three tasks resulted in lesion-wise Dice\nscores for PED: 0.653, 0.809, 0.826; MEN: 0.876, 0.867, 0.849; and MET: 0.555,\n0.6, 0.58; for the enhancing tumor, tumor core, and whole tumor, respectively.\nOur method was ranked first for PED, third for MEN, and fourth for MET,\nrespectively.\n","authors":["Daniel Capellán-Martín","Zhifan Jiang","Abhijeet Parida","Xinyang Liu","Van Lam","Hareem Nisar","Austin Tapp","Sarah Elsharkawi","Maria J. Ledesma-Carbayo","Syed Muhammad Anwar","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2409.08232v1.pdf","comment":"11 pages, 6 figures, 2 tables; This method ranked 1st, 3rd and 4th\n for BraTS2023 PED, MEN, and MET, respectively. This paper was accepted at\n MICCAI 2023's BrainLes Workshop"},{"id":"http://arxiv.org/abs/2409.02241v2","updated":"2024-09-12T17:03:27Z","published":"2024-09-03T19:14:01Z","title":"What Makes a Face Look like a Hat: Decoupling Low-level and High-level\n Visual Properties with Image Triplets","summary":" In visual decision making, high-level features, such as object categories,\nhave a strong influence on choice. However, the impact of low-level features on\nbehavior is less understood partly due to the high correlation between high-\nand low-level features in the stimuli presented (e.g., objects of the same\ncategory are more likely to share low-level features). To disentangle these\neffects, we propose a method that de-correlates low- and high-level visual\nproperties in a novel set of stimuli. 
Our method uses two Convolutional Neural\nNetworks (CNNs) as candidate models of the ventral visual stream: the CORnet-S\nthat has high neural predictivity in high-level, IT-like responses and the\nVGG-16 that has high neural predictivity in low-level responses. Triplets\n(root, image1, image2) of stimuli are parametrized by the level of low- and\nhigh-level similarity of images extracted from the different layers. These\nstimuli are then used in a decision-making task where participants are tasked\nto choose the most similar-to-the-root image. We found that different networks\nshow differing abilities to predict the effects of low-versus-high-level\nsimilarity: while CORnet-S outperforms VGG-16 in explaining human choices based\non high-level similarity, VGG-16 outperforms CORnet-S in explaining human\nchoices based on low-level similarity. Using Brain-Score, we observed that the\nbehavioral prediction abilities of different layers of these networks\nqualitatively corresponded to their ability to explain neural activity at\ndifferent levels of the visual hierarchy. In summary, our algorithm for\nstimulus set generation enables the study of how different representations in\nthe visual stream affect high-level cognitive behaviors.\n","authors":["Maytus Piriyajitakonkij","Sirawaj Itthipuripat","Ian Ballard","Ioannis Pappas"],"pdf_url":"https://arxiv.org/pdf/2409.02241v2.pdf","comment":"Accepted at Workshop on Human-inspired Computer Vision @ ECCV 2024"},{"id":"http://arxiv.org/abs/2409.08215v1","updated":"2024-09-12T16:55:51Z","published":"2024-09-12T16:55:51Z","title":"LT3SD: Latent Trees for 3D Scene Diffusion","summary":" We present LT3SD, a novel latent diffusion model for large-scale 3D scene\ngeneration. Recent advances in diffusion models have shown impressive results\nin 3D object generation, but are limited in spatial extent and quality when\nextended to 3D scenes. To generate complex and diverse 3D scene structures, we\nintroduce a latent tree representation to effectively encode both\nlower-frequency geometry and higher-frequency detail in a coarse-to-fine\nhierarchy. We can then learn a generative diffusion process in this latent 3D\nscene space, modeling the latent components of a scene at each resolution\nlevel. To synthesize large-scale scenes with varying sizes, we train our\ndiffusion model on scene patches and synthesize arbitrary-sized output 3D\nscenes through shared diffusion generation across multiple scene patches.\nThrough extensive experiments, we demonstrate the efficacy and benefits of\nLT3SD for large-scale, high-quality unconditional 3D scene generation and for\nprobabilistic completion for partial scene observations.\n","authors":["Quan Meng","Lei Li","Matthias Nießner","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2409.08215v1.pdf","comment":"Project page: https://quan-meng.github.io/projects/lt3sd/ Video:\n https://youtu.be/AJ5sG9VyjGA"},{"id":"http://arxiv.org/abs/2409.08207v1","updated":"2024-09-12T16:47:57Z","published":"2024-09-12T16:47:57Z","title":"VI3DRM:Towards meticulous 3D Reconstruction from Sparse Views via\n Photo-Realistic Novel View Synthesis","summary":" Recently, methods like Zero-1-2-3 have focused on single-view based 3D\nreconstruction and have achieved remarkable success. However, their predictions\nfor unseen areas heavily rely on the inductive bias of large-scale pretrained\ndiffusion models. 
Although subsequent work, such as DreamComposer, attempts to\nmake predictions more controllable by incorporating additional views, the\nresults remain unrealistic due to feature entanglement in the vanilla latent\nspace, including factors such as lighting, material, and structure. To address\nthese issues, we introduce the Visual Isotropy 3D Reconstruction Model\n(VI3DRM), a diffusion-based sparse views 3D reconstruction model that operates\nwithin an ID consistent and perspective-disentangled 3D latent space. By\nfacilitating the disentanglement of semantic information, color, material\nproperties and lighting, VI3DRM is capable of generating highly realistic\nimages that are indistinguishable from real photographs. By leveraging both\nreal and synthesized images, our approach enables the accurate construction of\npointmaps, ultimately producing finely textured meshes or point clouds. On the\nNVS task, tested on the GSO dataset, VI3DRM significantly outperforms\nstate-of-the-art method DreamComposer, achieving a PSNR of 38.61, an SSIM of\n0.929, and an LPIPS of 0.027. Code will be made available upon publication.\n","authors":["Hao Chen","Jiafu Wu","Ying Jin","Jinlong Peng","Xiaofeng Mao","Mingmin Chi","Mufeng Yao","Bo Peng","Jian Li","Yun Cao"],"pdf_url":"https://arxiv.org/pdf/2409.08207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08206v1","updated":"2024-09-12T16:46:41Z","published":"2024-09-12T16:46:41Z","title":"ComAlign: Compositional Alignment in Vision-Language Models","summary":" Vision-language models (VLMs) like CLIP have showcased a remarkable ability\nto extract transferable features for downstream tasks. Nonetheless, the\ntraining process of these models is usually based on a coarse-grained\ncontrastive loss between the global embedding of images and texts which may\nlose the compositional structure of these modalities. Many recent studies have\nshown VLMs lack compositional understandings like attribute binding and\nidentifying object relationships. Although some recent methods have tried to\nachieve finer-level alignments, they either are not based on extracting\nmeaningful components of proper granularity or don't properly utilize the\nmodalities' correspondence (especially in image-text pairs with more\ningredients). Addressing these limitations, we introduce Compositional\nAlignment (ComAlign), a fine-grained approach to discover more exact\ncorrespondence of text and image components using only the weak supervision in\nthe form of image-text pairs. Our methodology emphasizes that the compositional\nstructure (including entities and relations) extracted from the text modality\nmust also be retained in the image modality. To enforce correspondence of\nfine-grained concepts in image and text modalities, we train a lightweight\nnetwork lying on top of existing visual and language encoders using a small\ndataset. The network is trained to align nodes and edges of the structure\nacross the modalities. 
Experimental results on various VLMs and datasets\ndemonstrate significant improvements in retrieval and compositional benchmarks,\naffirming the effectiveness of our plugin model.\n","authors":["Ali Abdollah","Amirmohammad Izadi","Armin Saghafian","Reza Vahidimajd","Mohammad Mozafari","Amirreza Mirzaei","Mohammadmahdi Samiei","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2409.08206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08202v1","updated":"2024-09-12T16:41:47Z","published":"2024-09-12T16:41:47Z","title":"What Makes a Maze Look Like a Maze?","summary":" A unique aspect of human visual understanding is the ability to flexibly\ninterpret abstract concepts: acquiring lifted rules explaining what they\nsymbolize, grounding them across familiar and unfamiliar contexts, and making\npredictions or reasoning about them. While off-the-shelf vision-language models\nexcel at making literal interpretations of images (e.g., recognizing object\ncategories such as tree branches), they still struggle to make sense of such\nvisual abstractions (e.g., how an arrangement of tree branches may form the\nwalls of a maze). To address this challenge, we introduce Deep Schema Grounding\n(DSG), a framework that leverages explicit structured representations of visual\nabstractions for grounding and reasoning. At the core of DSG are\nschemas--dependency graph descriptions of abstract concepts that decompose them\ninto more primitive-level symbols. DSG uses large language models to extract\nschemas, then hierarchically grounds concrete to abstract components of the\nschema onto images with vision-language models. The grounded schema is used to\naugment visual abstraction understanding. We systematically evaluate DSG and\ndifferent methods in reasoning on our new Visual Abstractions Dataset, which\nconsists of diverse, real-world images of abstract concepts and corresponding\nquestion-answer pairs labeled by humans. We show that DSG significantly\nimproves the abstract visual reasoning performance of vision-language models,\nand is a step toward human-aligned understanding of visual abstractions.\n","authors":["Joy Hsu","Jiayuan Mao","Joshua B. Tenenbaum","Noah D. Goodman","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2409.08202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18100v3","updated":"2024-09-12T16:32:48Z","published":"2024-07-25T15:03:36Z","title":"DINOv2 Rocks Geological Image Analysis: Classification, Segmentation,\n and Interpretability","summary":" Recent advancements in computer vision have significantly improved image\nanalysis tasks. Yet, deep learning models often struggle when applied to\ndomains outside their training distribution, such as in geosciences, where\ndomain-specific data can be scarce. This study investigates the classification,\nsegmentation, and interpretability of CT-scan images of rock samples, focusing\non the application of modern computer vision techniques to geoscientific tasks.\nWe compare a range of segmentation methods to assess their efficacy,\nefficiency, and adaptability in geological image analysis. The methods\nevaluated include Otsu thresholding, clustering techniques (K-means, fuzzy\nC-means), a supervised machine learning approach (Random Forest), and deep\nlearning models (UNet, ResNet152, and DINOv2), using ten binary sandstone\ndatasets and three multi-class calcite datasets. 
DINOv2 was selected for its\npromising results in feature extraction and its potential applicability in\ngeoscientific tasks, prompting further assessment of its interpretability and\neffectiveness in processing CT-scanned rock data. For classification, a\nnon-fine-tuned DINOv2 demonstrates strong performance in classifying rock\nimages, even when the CT-scans are outside its original training set. In\nsegmentation tasks, thresholding and clustering techniques, though\ncomputationally efficient, produce subpar results despite preprocessing\nefforts. In contrast, supervised methods achieve better performance. While deep\nlearning methods demand greater computational resources, they require minimal\nintervention and offer superior generalization. A LoRA fine-tuned DINOv2, in\nparticular, excels in out-of-distribution segmentation and outperforms other\nmethods in multi-class tasks, even with limited data. Notably, the segmentation\nmasks generated by DINOv2 often appear more accurate than the original targets,\nbased on visual inspection.\n","authors":["Florent Brondolo","Samuel Beaussant"],"pdf_url":"https://arxiv.org/pdf/2407.18100v3.pdf","comment":"Minor typos fixing, added figures and revisions of some sections"},{"id":"http://arxiv.org/abs/2409.08189v1","updated":"2024-09-12T16:26:47Z","published":"2024-09-12T16:26:47Z","title":"Gaussian Garments: Reconstructing Simulation-Ready Clothing with\n Photorealistic Appearance from Multi-View Video","summary":" We introduce Gaussian Garments, a novel approach for reconstructing realistic\nsimulation-ready garment assets from multi-view videos. Our method represents\ngarments with a combination of a 3D mesh and a Gaussian texture that encodes\nboth the color and high-frequency surface details. This representation enables\naccurate registration of garment geometries to multi-view videos and helps\ndisentangle albedo textures from lighting effects. Furthermore, we demonstrate\nhow a pre-trained graph neural network (GNN) can be fine-tuned to replicate the\nreal behavior of each garment. The reconstructed Gaussian Garments can be\nautomatically combined into multi-garment outfits and animated with the\nfine-tuned GNN.\n","authors":["Boxiang Rong","Artur Grigorev","Wenbo Wang","Michael J. Black","Bernhard Thomaszewski","Christina Tsalicoglou","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2409.08189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08181v1","updated":"2024-09-12T16:13:07Z","published":"2024-09-12T16:13:07Z","title":"Enhancing Canine Musculoskeletal Diagnoses: Leveraging Synthetic Image\n Data for Pre-Training AI-Models on Visual Documentations","summary":" The examination of the musculoskeletal system in dogs is a challenging task\nin veterinary practice. In this work, a novel method has been developed that\nenables efficient documentation of a dog's condition through a visual\nrepresentation. However, since the visual documentation is new, there is no\nexisting training data. The objective of this work is therefore to mitigate the\nimpact of data scarcity in order to develop an AI-based diagnostic support\nsystem. To this end, the potential of synthetic data that mimics realistic\nvisual documentations of diseases for pre-training AI models is investigated.\nWe propose a method for generating synthetic image data that mimics realistic\nvisual documentations. 
Initially, a basic dataset containing three distinct\nclasses is generated, followed by the creation of a more sophisticated dataset\ncontaining 36 different classes. Both datasets are used for the pre-training of\nan AI model. Subsequently, an evaluation dataset is created, consisting of 250\nmanually created visual documentations for five different diseases. This\ndataset, along with a subset containing 25 examples, is used for evaluation. The obtained results on\nthe evaluation dataset containing 25 examples demonstrate a significant\nenhancement of approximately 10% in diagnosis accuracy when utilizing generated\nsynthetic images that mimic real-world visual documentations. However, these\nresults do not hold true for the larger evaluation dataset containing 250\nexamples, indicating that the advantages of using synthetic data for\npre-training an AI model emerge primarily when dealing with few examples of\nvisual documentations for a given disease. Overall, this work provides valuable\ninsights into mitigating the limitations imposed by limited training data\nthrough the strategic use of generated synthetic data, presenting an approach\napplicable beyond the canine musculoskeletal assessment domain.\n","authors":["Martin Thißen","Thi Ngoc Diep Tran","Ben Joel Schönbein","Ute Trapp","Barbara Esteve Ratsch","Beate Egner","Romana Piat","Elke Hergenröther"],"pdf_url":"https://arxiv.org/pdf/2409.08181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08171v1","updated":"2024-09-12T16:03:56Z","published":"2024-09-12T16:03:56Z","title":"Low-Cost Tree Crown Dieback Estimation Using Deep Learning-Based\n Segmentation","summary":" The global increase in observed forest dieback, characterised by the death of\ntree foliage, heralds widespread decline in forest ecosystems. This degradation\ncauses significant changes to ecosystem services and functions, including\nhabitat provision and carbon sequestration, which can be difficult to detect\nusing traditional monitoring techniques, highlighting the need for large-scale\nand high-frequency monitoring. Contemporary developments in the instruments and\nmethods to gather and process data at large scales mean this monitoring is now\npossible. In particular, the advancement of low-cost drone technology and deep\nlearning on consumer-level hardware provide new opportunities. Here, we use an\napproach based on deep learning and vegetation indices to assess crown dieback\nfrom RGB aerial data without the need for expensive instrumentation such as\nLiDAR. We use an iterative approach to match crown footprints predicted by deep\nlearning with field-based inventory data from a Mediterranean ecosystem\nexhibiting drought-induced dieback, and compare expert field-based crown\ndieback estimation with vegetation index-based estimates. We obtain high\noverall segmentation accuracy (mAP: 0.519) without the need for additional\ntechnical development of the underlying Mask R-CNN model, underscoring the\npotential of these approaches for non-expert use and proving their\napplicability to real-world conservation. We also find colour-coordinate based\nestimates of dieback correlate well with expert field-based estimation.\nSubstituting ground truth for Mask R-CNN model predictions showed negligible\nimpact on dieback estimates, indicating robustness. Our findings demonstrate\nthe potential of automated data collection and processing, including the\napplication of deep learning, to improve the coverage, speed and cost of forest\ndieback monitoring.\n","authors":["M. J. Allen","D. 
Moreno-Fernández","P. Ruiz-Benito","S. W. D. Grieve","E. R. Lines"],"pdf_url":"https://arxiv.org/pdf/2409.08171v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08170v1","updated":"2024-09-12T16:00:51Z","published":"2024-09-12T16:00:51Z","title":"AD-Lite Net: A Lightweight and Concatenated CNN Model for Alzheimer's\n Detection from MRI Images","summary":" Alzheimer's Disease (AD) is a non-curable progressive neurodegenerative\ndisorder that affects the human brain, leading to a decline in memory,\ncognitive abilities, and eventually, the ability to carry out daily tasks.\nManual diagnosis of Alzheimer's disease from MRI images is fraught with less\nsensitivity and it is a very tedious process for neurologists. Therefore, there\nis a need for an automatic Computer Assisted Diagnosis (CAD) system, which can\ndetect AD at early stages with higher accuracy. In this research, we have\nproposed a novel AD-Lite Net model (trained from scratch), that could alleviate\nthe aforementioned problem. The novelties we bring here in this research are,\n(I) We have proposed a very lightweight CNN model by incorporating Depth Wise\nSeparable Convolutional (DWSC) layers and Global Average Pooling (GAP) layers.\n(II) We have leveraged a ``parallel concatenation block'' (pcb), in the\nproposed AD-Lite Net model. This pcb consists of a Transformation layer\n(Tx-layer), followed by two convolutional layers, which are thereby\nconcatenated with the original base model. This Tx-layer converts the features\ninto very distinct kind of features, which are imperative for the Alzheimer's\ndisease. As a consequence, the proposed AD-Lite Net model with ``parallel\nconcatenation'' converges faster and automatically mitigates the class\nimbalance problem from the MRI datasets in a very generalized way. For the\nvalidity of our proposed model, we have implemented it on three different MRI\ndatasets. Furthermore, we have combined the ADNI and AD datasets and\nsubsequently performed a 10-fold cross-validation experiment to verify the\nmodel's generalization ability. Extensive experimental results showed that our\nproposed model has outperformed all the existing CNN models, and one recent\ntrend Vision Transformer (ViT) model by a significant margin.\n","authors":["Santanu Roy","Archit Gupta","Shubhi Tiwari","Palak Sahu"],"pdf_url":"https://arxiv.org/pdf/2409.08170v1.pdf","comment":"NA"},{"id":"http://arxiv.org/abs/2409.08169v1","updated":"2024-09-12T16:00:22Z","published":"2024-09-12T16:00:22Z","title":"Learning to Match 2D Keypoints Across Preoperative MR and Intraoperative\n Ultrasound","summary":" We propose in this paper a texture-invariant 2D keypoints descriptor\nspecifically designed for matching preoperative Magnetic Resonance (MR) images\nwith intraoperative Ultrasound (US) images. We introduce a\nmatching-by-synthesis strategy, where intraoperative US images are synthesized\nfrom MR images accounting for multiple MR modalities and intraoperative US\nvariability. We build our training set by enforcing keypoints localization over\nall images then train a patient-specific descriptor network that learns\ntexture-invariant discriminant features in a supervised contrastive manner,\nleading to robust keypoints descriptors. 
Our experiments on real cases with\nground truth show the effectiveness of the proposed approach, outperforming the\nstate-of-the-art methods and achieving 80.35% matching precision on average.\n","authors":["Hassan Rasheed","Reuben Dorent","Maximilian Fehrentz","Tina Kapur","William M. Wells III","Alexandra Golby","Sarah Frisken","Julia A. Schnabel","Nazim Haouchine"],"pdf_url":"https://arxiv.org/pdf/2409.08169v1.pdf","comment":"Accepted for publication at the International Workshop of Advances in\n Simplifying Medical UltraSound (ASMUS) at MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.04747v3","updated":"2024-09-12T16:00:08Z","published":"2024-09-07T07:44:33Z","title":"Explicit Mutual Information Maximization for Self-Supervised Learning","summary":" Recently, self-supervised learning (SSL) has been extensively studied.\nTheoretically, mutual information maximization (MIM) is an optimal criterion\nfor SSL, with a strong theoretical foundation in information theory. However,\nit is difficult to directly apply MIM in SSL since the data distribution is not\nanalytically available in applications. In practice, many existing methods can\nbe viewed as approximate implementations of the MIM criterion. This work shows\nthat, based on the invariance property of MI, explicit MI maximization can be\napplied to SSL under a generic distribution assumption, i.e., a relaxed\ncondition of the data distribution. We further illustrate this by analyzing the\ngeneralized Gaussian distribution. Based on this result, we derive a loss\nfunction based on the MIM criterion using only second-order statistics. We\nimplement the new loss for SSL and demonstrate its effectiveness via extensive\nexperiments.\n","authors":["Lele Chang","Peilin Liu","Qinghai Guo","Fei Wen"],"pdf_url":"https://arxiv.org/pdf/2409.04747v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08167v1","updated":"2024-09-12T15:58:28Z","published":"2024-09-12T15:58:28Z","title":"High-Frequency Anti-DreamBooth: Robust Defense Against Image Synthesis","summary":" Recently, text-to-image generative models have been misused to create\nunauthorized malicious images of individuals, posing a growing social problem.\nPrevious solutions, such as Anti-DreamBooth, add adversarial noise to images to\nprotect them from being used as training data for malicious generation.\nHowever, we found that the adversarial noise can be removed by adversarial\npurification methods such as DiffPure. Therefore, we propose a new adversarial\nattack method that adds strong perturbation on the high-frequency areas of\nimages to make it more robust to adversarial purification. Our experiment\nshowed that the adversarial images retained noise even after adversarial\npurification, hindering malicious image generation.\n","authors":["Takuto Onikubo","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2409.08167v1.pdf","comment":"ECCV 2024 Workshop The Dark Side of Generative AIs and Beyond"},{"id":"http://arxiv.org/abs/2409.08163v1","updated":"2024-09-12T15:56:17Z","published":"2024-09-12T15:56:17Z","title":"Open Source Infrastructure for Automatic Cell Segmentation","summary":" Automated cell segmentation is crucial for various biological and medical\napplications, facilitating tasks like cell counting, morphology analysis, and\ndrug discovery. However, manual segmentation is time-consuming and prone to\nsubjectivity, necessitating robust automated methods. 
This paper presents\nopen-source infrastructure, utilizing the UNet model, a deep-learning\narchitecture noted for its effectiveness in image segmentation tasks. This\nimplementation is integrated into the open-source DeepChem package, enhancing\naccessibility and usability for researchers and practitioners. The resulting\ntool offers a convenient and user-friendly interface, reducing the barrier to\nentry for cell segmentation while maintaining high accuracy. Additionally, we\nbenchmark this model against various datasets, demonstrating its robustness and\nversatility across different imaging conditions and cell types.\n","authors":["Aaron Rock Menezes","Bharath Ramsundar"],"pdf_url":"https://arxiv.org/pdf/2409.08163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08162v1","updated":"2024-09-12T15:55:39Z","published":"2024-09-12T15:55:39Z","title":"Cross-Attention Based Influence Model for Manual and Nonmanual Sign\n Language Analysis","summary":" Both manual (relating to the use of hands) and non-manual markers (NMM), such\nas facial expressions or mouthing cues, are important for providing the\ncomplete meaning of phrases in American Sign Language (ASL). Efforts have been\nmade in advancing sign language to spoken/written language understanding, but\nmost of these have primarily focused on manual features only. In this work,\nusing advanced neural machine translation methods, we examine and report on the\nextent to which facial expressions contribute to understanding sign language\nphrases. We present a sign language translation architecture consisting of\ntwo-stream encoders, with one encoder handling the face and the other handling\nthe upper body (with hands). We propose a new parallel cross-attention decoding\nmechanism that is useful for quantifying the influence of each input modality\non the output. The two streams from the encoder are directed simultaneously to\ndifferent attention stacks in the decoder. Examining the properties of the\nparallel cross-attention weights allows us to analyze the importance of facial\nmarkers compared to body and hand features during a translation task.\n","authors":["Lipisha Chaudhary","Fei Xu","Ifeoma Nwogu"],"pdf_url":"https://arxiv.org/pdf/2409.08162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08159v1","updated":"2024-09-12T15:52:08Z","published":"2024-09-12T15:52:08Z","title":"SDformer: Efficient End-to-End Transformer for Depth Completion","summary":" Depth completion aims to predict dense depth maps with sparse depth\nmeasurements from a depth sensor. Currently, Convolutional Neural Network (CNN)\nbased models are the most popular methods applied to depth completion tasks.\nHowever, despite the excellent high-end performance, they suffer from a limited\nrepresentation area. To overcome the drawbacks of CNNs, a more effective and\npowerful method has been presented: the Transformer, which is an adaptive\nself-attention setting sequence-to-sequence model. However, the computational cost of the standard\nTransformer grows quadratically with input resolution due to the key-query\ndot-product, which makes it ill-suited for depth completion\ntasks. In this work, we propose a different window-based Transformer\narchitecture for depth completion tasks named Sparse-to-Dense Transformer\n(SDformer). 
The network consists of an input module for the depth map and RGB\nimage features extraction and concatenation, a U-shaped encoder-decoder\nTransformer for extracting deep features, and a refinement module.\nSpecifically, we first concatenate the depth map features with the RGB image\nfeatures through the input model. Then, instead of calculating self-attention\nwith the whole feature maps, we apply different window sizes to extract the\nlong-range depth dependencies. Finally, we refine the predicted features from\nthe input module and the U-shaped encoder-decoder Transformer module to get the\nenriching depth features and employ a convolution layer to obtain the dense\ndepth map. In practice, the SDformer obtains state-of-the-art results against\nthe CNN-based depth completion models with lower computing loads and parameters\non the NYU Depth V2 and KITTI DC datasets.\n","authors":["Jian Qian","Miao Sun","Ashley Lee","Jie Li","Shenglong Zhuo","Patrick Yin Chiang"],"pdf_url":"https://arxiv.org/pdf/2409.08159v1.pdf","comment":"Presented at the International Conference on Industrial Automation,\n Robotics and Control Engineering (IARCE) 2022"},{"id":"http://arxiv.org/abs/2409.08156v1","updated":"2024-09-12T15:51:09Z","published":"2024-09-12T15:51:09Z","title":"MagicStyle: Portrait Stylization Based on Reference Image","summary":" The development of diffusion models has significantly advanced the research\non image stylization, particularly in the area of stylizing a content image\nbased on a given style image, which has attracted many scholars. The main\nchallenge in this reference image stylization task lies in how to maintain the\ndetails of the content image while incorporating the color and texture features\nof the style image. This challenge becomes even more pronounced when the\ncontent image is a portrait which has complex textural details. To address this\nchallenge, we propose a diffusion model-based reference image stylization\nmethod specifically for portraits, called MagicStyle. MagicStyle consists of\ntwo phases: Content and Style DDIM Inversion (CSDI) and Feature Fusion Forward\n(FFF). The CSDI phase involves a reverse denoising process, where DDIM\nInversion is performed separately on the content image and the style image,\nstoring the self-attention query, key and value features of both images during\nthe inversion process. The FFF phase executes forward denoising, harmoniously\nintegrating the texture and color information from the pre-stored feature\nqueries, keys and values into the diffusion generation process based on our\nWell-designed Feature Fusion Attention (FFA). We conducted comprehensive\ncomparative and ablation experiments to validate the effectiveness of our\nproposed MagicStyle and FFA.\n","authors":["Zhaoli Deng","Kaibin Zhou","Fanyi Wang","Zhenpeng Mi"],"pdf_url":"https://arxiv.org/pdf/2409.08156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02369v3","updated":"2024-09-12T15:46:58Z","published":"2024-08-05T10:38:50Z","title":"The NPU-ASLP System Description for Visual Speech Recognition in CNVSRC\n 2024","summary":" This paper delineates the visual speech recognition (VSR) system introduced\nby the NPU-ASLP (Team 237) in the second Chinese Continuous Visual Speech\nRecognition Challenge (CNVSRC 2024), engaging in all four tracks, including the\nfixed and open tracks of Single-Speaker VSR Task and Multi-Speaker VSR Task. In\nterms of data processing, we leverage the lip motion extractor from the\nbaseline1 to produce multiscale video data. 
Besides, various augmentation\ntechniques are applied during training, encompassing speed perturbation, random\nrotation, horizontal flipping, and color transformation. The VSR model adopts\nan end-to-end architecture with joint CTC/attention loss, introducing Enhanced\nResNet3D visual frontend, E-Branchformer encoder, and Bi-directional\nTransformer decoder. Our approach yields a 30.47% CER for the Single-Speaker\nTask and 34.30% CER for the Multi-Speaker Task, securing second place in the\nopen track of the Single-Speaker Task and first place in the other three\ntracks.\n","authors":["He Wang","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2408.02369v3.pdf","comment":"Included in CNVSRC Workshop 2024, NCMMSC 2024"},{"id":"http://arxiv.org/abs/2409.04631v2","updated":"2024-09-12T15:37:30Z","published":"2024-09-06T21:43:00Z","title":"Zero-Shot Whole Slide Image Retrieval in Histopathology Using Embeddings\n of Foundation Models","summary":" We have tested recently published foundation models for histopathology for\nimage retrieval. We report macro average of F1 score for top-1 retrieval,\nmajority of top-3 retrievals, and majority of top-5 retrievals. We perform\nzero-shot retrievals, i.e., we do not alter embeddings and we do not train any\nclassifier. As test data, we used diagnostic slides of TCGA, The Cancer Genome\nAtlas, consisting of 23 organs and 117 cancer subtypes. As a search platform we\nused Yottixel that enabled us to perform WSI search using patches. Achieved F1\nscores show low performance, e.g., for top-5 retrievals, 27% +/- 13%\n(Yottixel-DenseNet), 42% +/- 14% (Yottixel-UNI), 40%+/-13% (Yottixel-Virchow),\n41%+/-13% (Yottixel-GigaPath), and 41%+/-14% (GigaPath WSI).\n","authors":["Saghir Alfasly","Ghazal Alabtah","Sobhan Hemati","Krishna Rani Kalari","H. R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2409.04631v2.pdf","comment":"This paper will be updated with more results"},{"id":"http://arxiv.org/abs/2409.08143v1","updated":"2024-09-12T15:34:31Z","published":"2024-09-12T15:34:31Z","title":"Effective Segmentation of Post-Treatment Gliomas Using Simple\n Approaches: Artificial Sequence Generation and Ensemble Models","summary":" Segmentation is a crucial task in the medical imaging field and is often an\nimportant primary step or even a prerequisite to the analysis of medical\nvolumes. Yet treatments such as surgery complicate the accurate delineation of\nregions of interest. The BraTS Post-Treatment 2024 Challenge published the\nfirst public dataset for post-surgery glioma segmentation and addresses the\naforementioned issue by fostering the development of automated segmentation\ntools for glioma in MRI data. In this effort, we propose two straightforward\napproaches to enhance the segmentation performances of deep learning-based\nmethodologies. First, we incorporate an additional input based on a simple\nlinear combination of the available MRI sequences input, which highlights\nenhancing tumors. Second, we employ various ensembling methods to weigh the\ncontribution of a battery of models. 
Our results demonstrate that these\napproaches significantly improve segmentation performance compared to baseline\nmodels, underscoring the effectiveness of these simple approaches in improving\nmedical image segmentation tasks.\n","authors":["Heejong Kim","Leo Milecki","Mina C Moghadam","Fengbei Liu","Minh Nguyen","Eric Qiu","Abhishek Thanki","Mert R Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2409.08143v1.pdf","comment":"Invited for an Oral Presentation at the MICCAI BraTS Challenge 2024"},{"id":"http://arxiv.org/abs/2409.05587v2","updated":"2024-09-12T15:24:44Z","published":"2024-09-09T13:16:15Z","title":"DSDFormer: An Innovative Transformer-Mamba Framework for Robust\n High-Precision Driver Distraction Identification","summary":" Driver distraction remains a leading cause of traffic accidents, posing a\ncritical threat to road safety globally. As intelligent transportation systems\nevolve, accurate and real-time identification of driver distraction has become\nessential. However, existing methods struggle to capture both global contextual\nand fine-grained local features while contending with noisy labels in training\ndatasets. To address these challenges, we propose DSDFormer, a novel framework\nthat integrates the strengths of Transformer and Mamba architectures through a\nDual State Domain Attention (DSDA) mechanism, enabling a balance between\nlong-range dependencies and detailed feature extraction for robust driver\nbehavior recognition. Additionally, we introduce Temporal Reasoning Confident\nLearning (TRCL), an unsupervised approach that refines noisy labels by\nleveraging spatiotemporal correlations in video sequences. Our model achieves\nstate-of-the-art performance on the AUC-V1, AUC-V2, and 100-Driver datasets and\ndemonstrates real-time processing efficiency on the NVIDIA Jetson AGX Orin\nplatform. Extensive experimental results confirm that DSDFormer and TRCL\nsignificantly improve both the accuracy and robustness of driver distraction\ndetection, offering a scalable solution to enhance road safety.\n","authors":["Junzhou Chen","Zirui Zhang","Jing Yu","Heqiang Huang","Ronghui Zhang","Xuemiao Xu","Bin Sheng","Hong Yan"],"pdf_url":"https://arxiv.org/pdf/2409.05587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08130v1","updated":"2024-09-12T15:20:23Z","published":"2024-09-12T15:20:23Z","title":"The JPEG Pleno Learning-based Point Cloud Coding Standard: Serving Man\n and Machine","summary":" Efficient point cloud coding has become increasingly critical for multiple\napplications such as virtual reality, autonomous driving, and digital twin\nsystems, where rich and interactive 3D data representations may functionally\nmake the difference. Deep learning has emerged as a powerful tool in this\ndomain, offering advanced techniques for compressing point clouds more\nefficiently than conventional coding methods while also allowing effective\ncomputer vision tasks performed in the compressed domain thus, for the first\ntime, making available a common compressed visual representation effective for\nboth man and machine. Taking advantage of this potential, JPEG has recently\nfinalized the JPEG Pleno Learning-based Point Cloud Coding (PCC) standard\noffering efficient lossy coding of static point clouds, targeting both human\nvisualization and machine processing by leveraging deep learning models for\ngeometry and color coding. 
The geometry is processed directly in its original\n3D form using sparse convolutional neural networks, while the color data is\nprojected onto 2D images and encoded using the also learning-based JPEG AI\nstandard. The goal of this paper is to provide a complete technical description\nof the JPEG PCC standard, along with a thorough benchmarking of its performance\nagainst the state-of-the-art, while highlighting its main strengths and\nweaknesses. In terms of compression performance, JPEG PCC outperforms the\nconventional MPEG PCC standards, especially in geometry coding, achieving\nsignificant rate reductions. Color compression performance is less competitive\nbut this is overcome by the power of a full learning-based coding framework for\nboth geometry and color and the associated effective compressed domain\nprocessing.\n","authors":["André F. R. Guarda","Nuno M. M. Rodrigues","Fernando Pereira"],"pdf_url":"https://arxiv.org/pdf/2409.08130v1.pdf","comment":"28 pages, 12 figures, submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2311.13254v3","updated":"2024-09-12T15:16:24Z","published":"2023-11-22T09:18:49Z","title":"Unified Domain Adaptive Semantic Segmentation","summary":" Unsupervised Domain Adaptive Semantic Segmentation (UDA-SS) aims to transfer\nthe supervision from a labeled source domain to an unlabeled target domain. The\nmajority of existing UDA-SS works typically consider images whilst recent\nattempts have extended further to tackle videos by modeling the temporal\ndimension. Although the two lines of research share the major challenges --\novercoming the underlying domain distribution shift, their studies are largely\nindependent, resulting in fragmented insights, a lack of holistic\nunderstanding, and missed opportunities for cross-pollination of ideas. This\nfragmentation prevents the unification of methods, leading to redundant efforts\nand suboptimal knowledge transfer across image and video domains. Under this\nobservation, we advocate unifying the study of UDA-SS across video and image\nscenarios, enabling a more comprehensive understanding, synergistic\nadvancements, and efficient knowledge sharing. To that end, we explore the\nunified UDA-SS from a general data augmentation perspective, serving as a\nunifying conceptual framework, enabling improved generalization, and potential\nfor cross-pollination of ideas, ultimately contributing to the overall progress\nand practical impact of this field of research. Specifically, we propose a\nQuad-directional Mixup (QuadMix) method, characterized by tackling distinct\npoint attributes and feature inconsistencies through four-directional paths for\nintra- and inter-domain mixing in a feature space. To deal with temporal shifts\nwith videos, we incorporate optical flow-guided feature aggregation across\nspatial and temporal dimensions for fine-grained domain alignment. Extensive\nexperiments show that our method outperforms the state-of-the-art works by\nlarge margins on four challenging UDA-SS benchmarks. 
Our source code and models\nwill be released at \\url{https://github.com/ZHE-SAPI/UDASS}.\n","authors":["Zhe Zhang","Gaochang Wu","Jing Zhang","Xiatian Zhu","Dacheng Tao","Tianyou Chai"],"pdf_url":"https://arxiv.org/pdf/2311.13254v3.pdf","comment":"17 pages,11 figures, 11 tables, submitted to IEEE Transactions on\n Pattern Analysis and Machine Intelligence"},{"id":"http://arxiv.org/abs/2302.10763v4","updated":"2024-09-12T15:13:26Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v4.pdf","comment":"10 pages"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.06096v2","updated":"2024-09-12T19:13:17Z","published":"2024-09-09T22:16:48Z","title":"Latent Diffusion Bridges for Unsupervised Musical Audio Timbre Transfer","summary":" Music timbre transfer is a challenging task that involves modifying the\ntimbral characteristics of an audio signal while preserving its melodic\nstructure. In this paper, we propose a novel method based on dual diffusion\nbridges, trained using the CocoChorales Dataset, which consists of unpaired\nmonophonic single-instrument audio data. Each diffusion model is trained on a\nspecific instrument with a Gaussian prior. During inference, a model is\ndesignated as the source model to map the input audio to its corresponding\nGaussian prior, and another model is designated as the target model to\nreconstruct the target audio from this Gaussian prior, thereby facilitating\ntimbre transfer. We compare our approach against existing unsupervised timbre\ntransfer models such as VAEGAN and Gaussian Flow Bridges (GFB). Experimental\nresults demonstrate that our method achieves both better Fr\\'echet Audio\nDistance (FAD) and melody preservation, as reflected by lower pitch distances\n(DPD) compared to VAEGAN and GFB. Additionally, we discover that the noise\nlevel from the Gaussian prior, $\\sigma$, can be adjusted to control the degree\nof melody preservation and amount of timbre transferred.\n","authors":["Michele Mancusi","Yurii Halychanskyi","Kin Wai Cheuk","Chieh-Hsin Lai","Stefan Uhlich","Junghyun Koo","Marco A. 
Martínez-Ramírez","Wei-Hsiang Liao","Giorgio Fabbro","Yuhki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2409.06096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13378v2","updated":"2024-09-12T16:06:37Z","published":"2024-08-23T21:24:59Z","title":"DrugAgent: Explainable Drug Repurposing Agent with Large Language\n Model-based Reasoning","summary":" Drug repurposing offers a promising avenue for accelerating drug development\nby identifying new therapeutic potentials of existing drugs. In this paper, we\npropose a multi-agent framework to enhance the drug repurposing process using\nstate-of-the-art machine learning techniques and knowledge integration. Our\nframework comprises several specialized agents: an AI Agent trains robust\ndrug-target interaction (DTI) models; a Knowledge Graph Agent utilizes the\ndrug-gene interaction database (DGIdb), DrugBank, Comparative Toxicogenomics\nDatabase (CTD), and Search Tool for Interactions of Chemicals (STITCH) to\nsystematically extract DTIs; and a Search Agent interacts with biomedical\nliterature to annotate and verify computational predictions. By integrating\noutputs from these agents, our system effectively harnesses diverse data\nsources, including external databases, to propose viable repurposing\ncandidates. Preliminary results demonstrate the potential of our approach in\nnot only predicting drug-disease interactions but also in reducing the time and\ncost associated with traditional drug discovery methods. This paper highlights\nthe scalability of multi-agent systems in biomedical research and their role in\ndriving innovation in drug repurposing. Our approach not only outperforms\nexisting methods in predicting drug repurposing potential but also provides\ninterpretable results, paving the way for more efficient and cost-effective\ndrug discovery processes.\n","authors":["Yoshitaka Inoue","Tianci Song","Tianfan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.13378v2.pdf","comment":"18 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.08046v1","updated":"2024-09-12T13:51:06Z","published":"2024-09-12T13:51:06Z","title":"On the challenges of studying bias in Recommender Systems: A UserKNN\n case study","summary":" Statements on the propagation of bias by recommender systems are often hard\nto verify or falsify. Research on bias tends to draw from a small pool of\npublicly available datasets and is therefore bound by their specific\nproperties. Additionally, implementation choices are often not explicitly\ndescribed or motivated in research, while they may have an effect on bias\npropagation. In this paper, we explore the challenges of measuring and\nreporting popularity bias. We showcase the impact of data properties and\nalgorithm configurations on popularity bias by combining synthetic data with\nwell known recommender systems frameworks that implement UserKNN. First, we\nidentify data characteristics that might impact popularity bias, based on the\nfunctionality of UserKNN. Accordingly, we generate various datasets that\ncombine these characteristics. Second, we locate UserKNN configurations that\nvary across implementations in literature. We evaluate popularity bias for five\nsynthetic datasets and five UserKNN configurations, and offer insights on their\njoint effect. We find that, depending on the data characteristics, various\nUserKNN configurations can lead to different conclusions regarding the\npropagation of popularity bias. 
These results motivate the need for explicitly\naddressing algorithmic configuration and data properties when reporting and\ninterpreting bias in recommender systems.\n","authors":["Savvina Daniil","Manel Slokom","Mirjam Cuper","Cynthia C. S. Liem","Jacco van Ossenbruggen","Laura Hollink"],"pdf_url":"https://arxiv.org/pdf/2409.08046v1.pdf","comment":"Accepted at FAccTRec@RecSys 2024, 11 pages"},{"id":"http://arxiv.org/abs/2409.08014v1","updated":"2024-09-12T12:57:08Z","published":"2024-09-12T12:57:08Z","title":"An Evaluation Framework for Attributed Information Retrieval using Large\n Language Models","summary":" With the growing success of Large Language models (LLMs) in\ninformation-seeking scenarios, search engines are now adopting generative\napproaches to provide answers along with in-line citations as attribution.\nWhile existing work focuses mainly on attributed question answering, in this\npaper, we target information-seeking scenarios which are often more challenging\ndue to the open-ended nature of the queries and the size of the label space in\nterms of the diversity of candidate-attributed answers per query. We propose a\nreproducible framework to evaluate and benchmark attributed information\nseeking, using any backbone LLM, and different architectural designs: (1)\nGenerate (2) Retrieve then Generate, and (3) Generate then Retrieve.\nExperiments using HAGRID, an attributed information-seeking dataset, show the\nimpact of different scenarios on both the correctness and attributability of\nanswers.\n","authors":["Hanane Djeddal","Pierre Erbacher","Raouf Toukal","Laure Soulier","Karen Pinel-Sauvagnat","Sophia Katrenko","Lynda Tamine"],"pdf_url":"https://arxiv.org/pdf/2409.08014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07946v1","updated":"2024-09-12T11:14:25Z","published":"2024-09-12T11:14:25Z","title":"Collaborative Automatic Modulation Classification via Deep Edge\n Inference for Hierarchical Cognitive Radio Networks","summary":" In hierarchical cognitive radio networks, edge or cloud servers utilize the\ndata collected by edge devices for modulation classification, which, however,\nis faced with problems of the transmission overhead, data privacy, and\ncomputation load. In this article, an edge learning (EL) based framework\njointly mobilizing the edge device and the edge server for intelligent\nco-inference is proposed to realize the collaborative automatic modulation\nclassification (C-AMC) between them. A spectrum semantic compression neural\nnetwork (SSCNet) with the lightweight structure is designed for the edge device\nto compress the collected raw data into a compact semantic message that is then\nsent to the edge server via the wireless channel. On the edge server side, a\nmodulation classification neural network (MCNet) combining bidirectional long\nshort-term memory (Bi-LSTM) and multi-head attention layers is elaborated to\ndetermine the modulation type from the noisy semantic message. By leveraging\nthe computation resources of both the edge device and the edge server, high\ntransmission overhead and risks of data privacy leakage are avoided. 
The\nsimulation results verify the effectiveness of the proposed C-AMC framework,\nsignificantly reducing the model size and computational complexity.\n","authors":["Chaowei He","Peihao Dong","Fuhui Zhou","Qihui Wu"],"pdf_url":"https://arxiv.org/pdf/2409.07946v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.20772"},{"id":"http://arxiv.org/abs/2409.07850v1","updated":"2024-09-12T08:53:11Z","published":"2024-09-12T08:53:11Z","title":"Enhancing Cross-Market Recommendation System with Graph Isomorphism\n Networks: A Novel Approach to Personalized User Experience","summary":" In today's world of globalized commerce, cross-market recommendation systems\n(CMRs) are crucial for providing personalized user experiences across diverse\nmarket segments. However, traditional recommendation algorithms have\ndifficulties dealing with market specificity and data sparsity, especially in\nnew or emerging markets. In this paper, we propose the CrossGR model, which\nutilizes Graph Isomorphism Networks (GINs) to improve CMR systems. It\noutperforms existing benchmarks in NDCG@10 and HR@10 metrics, demonstrating its\nadaptability and accuracy in handling diverse market segments. The CrossGR\nmodel is adaptable and accurate, making it well-suited for handling the\ncomplexities of cross-market recommendation tasks. Its robustness is\ndemonstrated by consistent performance across different evaluation timeframes,\nindicating its potential to cater to evolving market trends and user\npreferences. Our findings suggest that GINs represent a promising direction for\nCMRs, paving the way for more sophisticated, personalized, and context-aware\nrecommendation systems in the dynamic landscape of global e-commerce.\n","authors":["Sümeyye Öztürk","Ahmed Burak Ercan","Resul Tugay","Şule Gündüz Öğüdücü"],"pdf_url":"https://arxiv.org/pdf/2409.07850v1.pdf","comment":"7 pages, 1 figure, 3 tables, 5 equations"},{"id":"http://arxiv.org/abs/2409.07773v1","updated":"2024-09-12T06:13:07Z","published":"2024-09-12T06:13:07Z","title":"PDC-FRS: Privacy-preserving Data Contribution for Federated Recommender\n System","summary":" Federated recommender systems (FedRecs) have emerged as a popular research\ndirection for protecting users' privacy in on-device recommendations. In\nFedRecs, users keep their data locally and only contribute their local\ncollaborative information by uploading model parameters to a central server.\nWhile this rigid framework protects users' raw data during training, it\nseverely compromises the recommendation model's performance due to the\nfollowing reasons: (1) Due to the power law distribution nature of user\nbehavior data, individual users have few data points to train a recommendation\nmodel, resulting in uploaded model updates that may be far from optimal; (2) As\neach user's uploaded parameters are learned from local data, which lacks global\ncollaborative information, relying solely on parameter aggregation methods such\nas FedAvg to fuse global collaborative information may be suboptimal. To bridge\nthis performance gap, we propose a novel federated recommendation framework,\nPDC-FRS. Specifically, we design a privacy-preserving data contribution\nmechanism that allows users to share their data with a differential privacy\nguarantee. Based on the shared but perturbed data, an auxiliary model is\ntrained in parallel with the original federated recommendation process. 
This\nauxiliary model enhances FedRec by augmenting each user's local dataset and\nintegrating global collaborative information. To demonstrate the effectiveness\nof PDC-FRS, we conduct extensive experiments on two widely used recommendation\ndatasets. The empirical results showcase the superiority of PDC-FRS compared to\nbaseline methods.\n","authors":["Chaoqun Yang","Wei Yuan","Liang Qu","Thanh Tam Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.07773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10555v2","updated":"2024-09-12T05:52:05Z","published":"2024-08-20T05:38:47Z","title":"GACL: Graph Attention Collaborative Learning for Temporal QoS Prediction","summary":" Accurate prediction of temporal QoS is crucial for maintaining service\nreliability and enhancing user satisfaction in dynamic service-oriented\nenvironments. However, current methods often neglect high-order latent\ncollaborative relationships and fail to dynamically adjust feature learning for\nspecific user-service invocations, which are critical for precise feature\nextraction within each time slice. Moreover, the prevalent use of RNNs for\nmodeling temporal feature evolution patterns is constrained by their inherent\ndifficulty in managing long-range dependencies, thereby limiting the detection\nof long-term QoS trends across multiple time slices. These shortcomings\ndramatically degrade the performance of temporal QoS prediction. To address the\ntwo issues, we propose a novel Graph Attention Collaborative Learning (GACL)\nframework for temporal QoS prediction. Building on a dynamic user-service\ninvocation graph to comprehensively model historical interactions, it designs a\ntarget-prompt graph attention network to extract deep latent features of users\nand services at each time slice, considering implicit target-neighboring\ncollaborative relationships and historical QoS values. Additionally, a\nmulti-layer Transformer encoder is introduced to uncover temporal feature\nevolution patterns, enhancing temporal QoS prediction. Extensive experiments on\nthe WS-DREAM dataset demonstrate that GACL significantly outperforms\nstate-of-the-art methods for temporal QoS prediction across multiple evaluation\nmetrics, achieving the improvements of up to 38.80%.\n","authors":["Shengxiang Hu","Guobing Zou","Bofeng Zhang","Shaogang Wu","Shiyi Lin","Yanglan Gan","Yixin Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10555v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.07730v1","updated":"2024-09-12T03:33:19Z","published":"2024-09-12T03:33:19Z","title":"Music auto-tagging in the long tail: A few-shot approach","summary":" In the realm of digital music, using tags to efficiently organize and\nretrieve music from extensive databases is crucial for music catalog owners.\nHuman tagging by experts is labor-intensive but mostly accurate, whereas\nautomatic tagging through supervised learning has approached satisfying\naccuracy but is restricted to a predefined set of training tags. Few-shot\nlearning offers a viable solution to expand beyond this small set of predefined\ntags by enabling models to learn from only a few human-provided examples to\nunderstand tag meanings and subsequently apply these tags autonomously. 
We\npropose to integrate few-shot learning methodology into multi-label music\nauto-tagging by using features from pre-trained models as inputs to a\nlightweight linear classifier, also known as a linear probe. We investigate\ndifferent popular pre-trained features, as well as different few-shot\nparametrizations with varying numbers of classes and samples per class. Our\nexperiments demonstrate that a simple model with pre-trained features can\nachieve performance close to state-of-the-art models while using significantly\nless training data, such as 20 samples per tag. Additionally, our linear probe\nperforms competitively with leading models when trained on the entire training\ndataset. The results show that this transfer learning-based few-shot approach\ncould effectively address the issue of automatically assigning long-tail tags\nwith only limited labeled data.\n","authors":["T. Aleksandra Ma","Alexander Lerch"],"pdf_url":"https://arxiv.org/pdf/2409.07730v1.pdf","comment":"Published in Audio Engineering Society NY Show 2024 as a Peer\n Reviewed (Category 1) paper"},{"id":"http://arxiv.org/abs/2409.07709v1","updated":"2024-09-12T02:25:41Z","published":"2024-09-12T02:25:41Z","title":"Harnessing TI Feeds for Exploitation Detection","summary":" Many organizations rely on Threat Intelligence (TI) feeds to assess the risk\nassociated with security threats. Due to the volume and heterogeneity of data,\nit is prohibitive to manually analyze the threat information available in\ndifferent loosely structured TI feeds. Thus, there is a need to develop\nautomated methods to vet and extract actionable information from TI feeds. To\nthis end, we present a machine learning pipeline to automatically detect\nvulnerability exploitation from TI feeds. We first model threat vocabulary in\nloosely structured TI feeds using state-of-the-art embedding techniques\n(Doc2Vec and BERT) and then use it to train a supervised machine learning\nclassifier to detect exploitation of security vulnerabilities. We use our\napproach to identify exploitation events in 191 different TI feeds. Our\nlongitudinal evaluation shows that it is able to accurately identify\nexploitation events from TI feeds only using past data for training and even on\nTI feeds withheld from training. Our proposed approach is useful for a variety\nof downstream tasks such as data-driven vulnerability risk assessment.\n","authors":["Kajal Patel","Zubair Shafiq","Mateus Nogueira","Daniel Sadoc Menasché","Enrico Lovat","Taimur Kashif","Ashton Woiwood","Matheus Martins"],"pdf_url":"https://arxiv.org/pdf/2409.07709v1.pdf","comment":"This paper appears at IEEE International Conference on Cyber Security\n and Resilience (IEEE CSR 2024)"},{"id":"http://arxiv.org/abs/2409.07691v1","updated":"2024-09-12T01:51:06Z","published":"2024-09-12T01:51:06Z","title":"Enhancing Q&A Text Retrieval with Ranking Models: Benchmarking,\n fine-tuning and deploying Rerankers for RAG","summary":" Ranking models play a crucial role in enhancing overall accuracy of text\nretrieval systems. These multi-stage systems typically utilize either dense\nembedding models or sparse lexical indices to retrieve relevant passages based\non a given query, followed by ranking models that refine the ordering of the\ncandidate passages by its relevance to the query.\n This paper benchmarks various publicly available ranking models and examines\ntheir impact on ranking accuracy. 
We focus on text retrieval for\nquestion-answering tasks, a common use case for Retrieval-Augmented Generation\nsystems. Our evaluation benchmarks include models some of which are\ncommercially viable for industrial applications.\n We introduce a state-of-the-art ranking model, NV-RerankQA-Mistral-4B-v3,\nwhich achieves a significant accuracy increase of ~14% compared to pipelines\nwith other rerankers. We also provide an ablation study comparing the\nfine-tuning of ranking models with different sizes, losses and self-attention\nmechanisms.\n Finally, we discuss challenges of text retrieval pipelines with ranking\nmodels in real-world industry applications, in particular the trade-offs among\nmodel size, ranking accuracy and system requirements like indexing and serving\nlatency / throughput.\n","authors":["Gabriel de Souza P. Moreira","Ronay Ak","Benedikt Schifferer","Mengyao Xu","Radek Osmulski","Even Oldridge"],"pdf_url":"https://arxiv.org/pdf/2409.07691v1.pdf","comment":"Accepted for the 1st Workshop on GenAI and RAG Systems for Enterprise\n @ CIKM 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.08838v2","updated":"2024-09-12T23:09:04Z","published":"2024-07-11T19:47:37Z","title":"Deep Learning for Network Anomaly Detection under Data Contamination:\n Evaluating Robustness and Mitigating Performance Degradation","summary":" Deep learning (DL) has emerged as a crucial tool in network anomaly detection\n(NAD) for cybersecurity. While DL models for anomaly detection excel at\nextracting features and learning patterns from data, they are vulnerable to\ndata contamination -- the inadvertent inclusion of attack-related data in\ntraining sets presumed benign. This study evaluates the robustness of six\nunsupervised DL algorithms against data contamination using our proposed\nevaluation protocol. Results demonstrate significant performance degradation in\nstate-of-the-art anomaly detection algorithms when exposed to contaminated\ndata, highlighting the critical need for self-protection mechanisms in DL-based\nNAD models. To mitigate this vulnerability, we propose an enhanced auto-encoder\nwith a constrained latent representation, allowing normal data to cluster more\ndensely around a learnable center in the latent space. Our evaluation reveals\nthat this approach exhibits improved resistance to data contamination compared\nto existing methods, offering a promising direction for more robust NAD\nsystems.\n","authors":["D'Jeff K. Nkashama","Jordan Masakuna Félicien","Arian Soltani","Jean-Charles Verdier","Pierre-Martin Tardif","Marc Frappier","Froduald Kabanza"],"pdf_url":"https://arxiv.org/pdf/2407.08838v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2207.03576"},{"id":"http://arxiv.org/abs/2409.06613v2","updated":"2024-09-12T23:01:59Z","published":"2024-09-10T16:05:25Z","title":"DemoStart: Demonstration-led auto-curriculum applied to sim-to-real with\n multi-fingered robots","summary":" We present DemoStart, a novel auto-curriculum reinforcement learning method\ncapable of learning complex manipulation behaviors on an arm equipped with a\nthree-fingered robotic hand, from only a sparse reward and a handful of\ndemonstrations in simulation. Learning from simulation drastically reduces the\ndevelopment cycle of behavior generation, and domain randomization techniques\nare leveraged to achieve successful zero-shot sim-to-real transfer. Transferred\npolicies are learned directly from raw pixels from multiple cameras and robot\nproprioception. 
Our approach outperforms policies learned from demonstrations\non the real robot and requires 100 times fewer demonstrations, collected in\nsimulation. More details and videos at https://sites.google.com/view/demostart.\n","authors":["Maria Bauza","Jose Enrique Chen","Valentin Dalibard","Nimrod Gileadi","Roland Hafner","Murilo F. Martins","Joss Moore","Rugile Pevceviciute","Antoine Laurens","Dushyant Rao","Martina Zambelli","Martin Riedmiller","Jon Scholz","Konstantinos Bousmalis","Francesco Nori","Nicolas Heess"],"pdf_url":"https://arxiv.org/pdf/2409.06613v2.pdf","comment":"15 pages total with 7 pages of appendix. 9 Figures, 4 in the main\n text and 5 in the appendix"},{"id":"http://arxiv.org/abs/2409.08422v1","updated":"2024-09-12T22:51:08Z","published":"2024-09-12T22:51:08Z","title":"Fitted Q-Iteration via Max-Plus-Linear Approximation","summary":" In this study, we consider the application of max-plus-linear approximators\nfor the Q-function in offline reinforcement learning of discounted Markov decision\nprocesses. In particular, we incorporate these approximators to propose novel\nfitted Q-iteration (FQI) algorithms with provable convergence. Exploiting the\ncompatibility of the Bellman operator with max-plus operations, we show that\nthe max-plus-linear regression within each iteration of the proposed FQI\nalgorithm reduces to simple max-plus matrix-vector multiplications. We also\nconsider the variational implementation of the proposed algorithm which leads\nto a per-iteration complexity that is independent of the number of samples.\n","authors":["Y. Liu","M. A. S. Kolarijani"],"pdf_url":"https://arxiv.org/pdf/2409.08422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08419v1","updated":"2024-09-12T22:45:10Z","published":"2024-09-12T22:45:10Z","title":"Introducing CausalBench: A Flexible Benchmark Framework for Causal\n Analysis and Machine Learning","summary":" While witnessing the exceptional success of machine learning (ML)\ntechnologies in many applications, users are starting to notice a critical\nshortcoming of ML: correlation is a poor substitute for causation. The\nconventional way to discover causal relationships is to use randomized\ncontrolled experiments (RCT); in many situations, however, these are\nimpractical or sometimes unethical. Causal learning from observational data\noffers a promising alternative. While being relatively recent, causal learning\naims to go far beyond conventional machine learning, yet several major\nchallenges remain. Unfortunately, advances are hampered due to the lack of\nunified benchmark datasets, algorithms, metrics, and evaluation service\ninterfaces for causal learning. In this paper, we introduce {\\em CausalBench},\na transparent, fair, and easy-to-use evaluation platform, aiming to (a) enable\nthe advancement of research in causal learning by facilitating scientific\ncollaboration in novel algorithms, datasets, and metrics and (b) promote\nscientific objectivity, reproducibility, fairness, and awareness of bias in\ncausal learning research. CausalBench provides services for benchmarking data,\nalgorithms, models, and metrics, addressing the needs of a broad range of scientific\nand engineering disciplines.\n","authors":["Ahmet Kapkiç","Pratanu Mandal","Shu Wan","Paras Sheth","Abhinav Gorantla","Yoonhyuk Choi","Huan Liu","K. 
Selçuk Candan"],"pdf_url":"https://arxiv.org/pdf/2409.08419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02764v2","updated":"2024-09-12T22:18:03Z","published":"2024-05-04T22:00:28Z","title":"Assessing Adversarial Robustness of Large Language Models: An Empirical\n Study","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nbut their robustness against adversarial attacks remains a critical concern. We\npresent a novel white-box style attack approach that exposes vulnerabilities\nin leading open-source LLMs, including Llama, OPT, and T5. We assess the impact\nof model size, structure, and fine-tuning strategies on their resistance to\nadversarial perturbations. Our comprehensive evaluation across five diverse\ntext classification tasks establishes a new benchmark for LLM robustness. The\nfindings of this study have far-reaching implications for the reliable\ndeployment of LLMs in real-world applications and contribute to the advancement\nof trustworthy AI systems.\n","authors":["Zeyu Yang","Zhao Meng","Xiaochen Zheng","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2405.02764v2.pdf","comment":"Oral presentation at KDD 2024 GenAI Evaluation workshop"},{"id":"http://arxiv.org/abs/2409.08409v1","updated":"2024-09-12T21:40:04Z","published":"2024-09-12T21:40:04Z","title":"Wasserstein Distributionally Robust Multiclass Support Vector Machine","summary":" We study the problem of multiclass classification for settings where data\nfeatures $\\mathbf{x}$ and their labels $\\mathbf{y}$ are uncertain. We identify\nthat distributionally robust one-vs-all (OVA) classifiers often struggle in\nsettings with imbalanced data. To address this issue, we use Wasserstein\ndistributionally robust optimization to develop a robust version of the\nmulticlass support vector machine (SVM) characterized by the Crammer-Singer\n(CS) loss. First, we prove that the CS loss is bounded from above by a\nLipschitz continuous function for all $\\mathbf{x} \\in \\mathcal{X}$ and\n$\\mathbf{y} \\in \\mathcal{Y}$, then we exploit strong duality results to express\nthe dual of the worst-case risk problem, and we show that the worst-case risk\nminimization problem admits a tractable convex reformulation due to the\nregularity of the CS loss. Moreover, we develop a kernel version of our\nproposed model to account for nonlinear class separation, and we show that it\nadmits a tractable convex upper bound. We also propose a projected subgradient\nmethod algorithm for a special case of our proposed linear model to improve\nscalability. Our numerical experiments demonstrate that our model outperforms\nstate-of-the-art OVA models in settings where the training data is highly\nimbalanced. We also show through experiments on popular real-world datasets\nthat our proposed model often outperforms its regularized counterpart, as the\nformer accounts for uncertain labels while the latter does not.\n","authors":["Michael Ibrahim","Heraldo Rozas","Nagi Gebraeel"],"pdf_url":"https://arxiv.org/pdf/2409.08409v1.pdf","comment":"26 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.08400v1","updated":"2024-09-12T21:12:21Z","published":"2024-09-12T21:12:21Z","title":"Scores as Actions: a framework of fine-tuning diffusion models by\n continuous-time reinforcement learning","summary":" Reinforcement Learning from human feedback (RLHF) has been shown to be a promising\ndirection for aligning generative models with human intent and has also been\nexplored in recent works for alignment of diffusion generative models. 
In this\nwork, we provide a rigorous treatment by formulating the task of fine-tuning\ndiffusion models, with reward functions learned from human feedback, as an\nexploratory continuous-time stochastic control problem. Our key idea lies in\ntreating the score-matching functions as controls/actions, and upon this, we\ndevelop a unified framework from a continuous-time perspective, to employ\nreinforcement learning (RL) algorithms to improve the generation\nquality of diffusion models. We also develop the corresponding continuous-time\nRL theory for policy optimization and regularization under the assumption of an\nenvironment driven by stochastic differential equations. Experiments on\ntext-to-image (T2I) generation will be reported in the accompanying paper.\n","authors":["Hanyang Zhao","Haoxian Chen","Ji Zhang","David D. Yao","Wenpin Tang"],"pdf_url":"https://arxiv.org/pdf/2409.08400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04913v2","updated":"2024-09-12T21:04:20Z","published":"2024-09-07T21:27:49Z","title":"NGD converges to less degenerate solutions than SGD","summary":" The number of free parameters, or dimension, of a model is a straightforward\nway to measure its complexity: a model with more parameters can encode more\ninformation. However, this is not an accurate measure of complexity: models\ncapable of memorizing their training data often generalize well despite their\nhigh dimension. Effective dimension aims to more directly capture the\ncomplexity of a model by counting only the number of parameters required to\nrepresent the functionality of the model. Singular learning theory (SLT)\nproposes the learning coefficient $ \\lambda $ as a more accurate measure of\neffective dimension. By describing the rate of increase of the volume of the\nregion of parameter space around a local minimum with respect to loss, $\n\\lambda $ incorporates information from higher-order terms. We compare $\n\\lambda $ of models trained using natural gradient descent (NGD) and stochastic\ngradient descent (SGD), and find that those trained with NGD consistently have\na higher effective dimension for both of our methods: the Hessian trace $\n\\text{Tr}(\\mathbf{H}) $, and the estimate of the local learning coefficient\n(LLC) $ \\hat{\\lambda}(w^*) $.\n","authors":["Moosa Saghir","N. R. Raghavendra","Zihe Liu","Evan Ryan Gunter"],"pdf_url":"https://arxiv.org/pdf/2409.04913v2.pdf","comment":"8 pages, 23 figures"},{"id":"http://arxiv.org/abs/2407.09753v2","updated":"2024-09-12T20:57:17Z","published":"2024-07-13T03:09:22Z","title":"Biased Backpressure Routing Using Link Features and Graph Neural\n Networks","summary":" To reduce the latency of Backpressure (BP) routing in wireless multi-hop\nnetworks, we propose to enhance the existing shortest path-biased BP (SP-BP)\nand sojourn time-based backlog metrics, since they introduce no additional time\nstep-wise signaling overhead to the basic BP. Rather than relying on\nhop-distance, we introduce a new edge-weighted shortest path bias built on the\nscheduling duty cycle of wireless links, which can be predicted by a graph\nconvolutional neural network based on the topology and traffic of wireless\nnetworks. Additionally, we tackle three long-standing challenges associated\nwith SP-BP: optimal bias scaling, efficient bias maintenance, and integration\nof delay awareness. Our proposed solutions inherit the throughput optimality of\nthe basic BP, as well as its practical advantages of low complexity and fully\ndistributed implementation. 
Our approaches rely on common link features and\nintroduce only a one-time constant overhead to previous SP-BP schemes, or a\none-time overhead linear in the network size to the basic BP. Numerical\nexperiments show that our solutions can effectively address the major drawbacks\nof slow startup, random walk, and the last packet problem in basic BP,\nimproving the end-to-end delay of existing low-overhead BP algorithms under\nvarious settings of network traffic, interference, and mobility.\n","authors":["Zhongyuan Zhao","Bojan Radojičić","Gunjan Verma","Ananthram Swami","Santiago Segarra"],"pdf_url":"https://arxiv.org/pdf/2407.09753v2.pdf","comment":"16 pages, 15 figures, accepted for publication in IEEE Transactions\n on Machine Learning in Communications and Networking. arXiv admin note: text\n overlap with arXiv:2310.04364, arXiv:2211.10748"},{"id":"http://arxiv.org/abs/2409.08396v1","updated":"2024-09-12T20:55:21Z","published":"2024-09-12T20:55:21Z","title":"Federated One-Shot Ensemble Clustering","summary":" Cluster analysis across multiple institutions poses significant challenges\ndue to data-sharing restrictions. To overcome these limitations, we introduce\nthe Federated One-shot Ensemble Clustering (FONT) algorithm, a novel solution\ntailored for multi-site analyses under such constraints. FONT requires only a\nsingle round of communication between sites and ensures privacy by exchanging\nonly fitted model parameters and class labels. The algorithm combines locally\nfitted clustering models into a data-adaptive ensemble, making it broadly\napplicable to various clustering techniques and robust to differences in\ncluster proportions across sites. Our theoretical analysis validates the\neffectiveness of the data-adaptive weights learned by FONT, and simulation\nstudies demonstrate its superior performance compared to existing benchmark\nmethods. We applied FONT to identify subgroups of patients with rheumatoid\narthritis across two health systems, revealing improved consistency of patient\nclusters across sites, while locally fitted clusters proved less transferable.\nFONT is particularly well-suited for real-world applications with stringent\ncommunication and privacy constraints, offering a scalable and practical\nsolution for multi-site clustering.\n","authors":["Rui Duan","Xin Xiong","Jueyi Liu","Katherine P. Liao","Tianxi Cai"],"pdf_url":"https://arxiv.org/pdf/2409.08396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08395v1","updated":"2024-09-12T20:48:28Z","published":"2024-09-12T20:48:28Z","title":"Graphical Structural Learning of rs-fMRI data in Heavy Smokers","summary":" Recent studies revealed structural and functional brain changes in heavy\nsmokers. However, the specific changes in topological brain connections are not\nwell understood. We used Gaussian Undirected Graphs with the graphical lasso\nalgorithm on rs-fMRI data from smokers and non-smokers to identify significant\nchanges in brain connections. 
Our results indicate high stability in the\nestimated graphs and identify several brain regions significantly affected by\nsmoking, providing valuable insights for future clinical research.\n","authors":["Yiru Gong","Qimin Zhang","Huili Zhen","Zheyan Liu","Shaohan Chen"],"pdf_url":"https://arxiv.org/pdf/2409.08395v1.pdf","comment":"Accepted by IEEE CCSB 2024 conference"},{"id":"http://arxiv.org/abs/2409.08389v1","updated":"2024-09-12T20:37:14Z","published":"2024-09-12T20:37:14Z","title":"Higher-Order Topological Directionality and Directed Simplicial Neural\n Networks","summary":" Topological Deep Learning (TDL) has emerged as a paradigm to process and\nlearn from signals defined on higher-order combinatorial topological spaces,\nsuch as simplicial or cell complexes. Although many complex systems have an\nasymmetric relational structure, most TDL models forcibly symmetrize these\nrelationships. In this paper, we first introduce a novel notion of higher-order\ndirectionality and we then design Directed Simplicial Neural Networks\n(Dir-SNNs) based on it. Dir-SNNs are message-passing networks operating on\ndirected simplicial complexes able to leverage directed and possibly asymmetric\ninteractions among the simplices. To our knowledge, this is the first TDL model\nusing a notion of higher-order directionality. We theoretically and empirically\nprove that Dir-SNNs are more expressive than their directed graph counterpart\nin distinguishing isomorphic directed graphs. Experiments on a synthetic source\nlocalization task demonstrate that Dir-SNNs outperform undirected SNNs when the\nunderlying complex is directed, and perform comparably when the underlying\ncomplex is undirected.\n","authors":["Manuel Lecha","Andrea Cavallo","Francesca Dominici","Elvin Isufi","Claudio Battiloro"],"pdf_url":"https://arxiv.org/pdf/2409.08389v1.pdf","comment":"7 pages, 8 figures, 1 table"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.08381v1","updated":"2024-09-12T20:02:51Z","published":"2024-09-12T20:02:51Z","title":"Rethinking Prompting Strategies for Multi-Label Recognition with Partial\n Annotations","summary":" Vision-language models (VLMs) like CLIP have been adapted for Multi-Label\nRecognition (MLR) with partial annotations by leveraging prompt-learning, where\npositive and negative prompts are learned for each class to associate their\nembeddings with class presence or absence in the shared vision-text feature\nspace. While this approach improves MLR performance by relying on VLM priors,\nwe hypothesize that learning negative prompts may be suboptimal, as the\ndatasets used to train VLMs lack image-caption pairs explicitly focusing on\nclass absence. To analyze the impact of positive and negative prompt learning\non MLR, we introduce PositiveCoOp and NegativeCoOp, where only one prompt is\nlearned with VLM guidance while the other is replaced by an embedding vector\nlearned directly in the shared feature space without relying on the text\nencoder. Through empirical analysis, we observe that negative prompts degrade\nMLR performance, and learning only positive prompts, combined with learned\nnegative embeddings (PositiveCoOp), outperforms dual prompt learning\napproaches. 
Moreover, we quantify the performance benefits that prompt-learning\noffers over a simple vision-features-only baseline, observing that the baseline\ndisplays strong performance comparable to the dual prompt learning approach\n(DualCoOp), when the proportion of missing labels is low, while requiring half\nthe training compute and 16 times fewer parameters.\n","authors":["Samyak Rawlekar","Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2409.08381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08270v1","updated":"2024-09-12T17:58:13Z","published":"2024-09-12T17:58:13Z","title":"FlashSplat: 2D to 3D Gaussian Splatting Segmentation Solved Optimally","summary":" This study addresses the challenge of accurately segmenting 3D Gaussian\nSplatting from 2D masks. Conventional methods often rely on iterative gradient\ndescent to assign each Gaussian a unique label, leading to lengthy optimization\nand sub-optimal solutions. Instead, we propose a straightforward yet globally\noptimal solver for 3D-GS segmentation. The core insight of our method is that,\nwith a reconstructed 3D-GS scene, the rendering of the 2D masks is essentially\na linear function with respect to the labels of each Gaussian. As such, the\noptimal label assignment can be solved via linear programming in closed form.\nThis solution capitalizes on the alpha blending characteristic of the splatting\nprocess for single step optimization. By incorporating the background bias in\nour objective function, our method shows superior robustness in 3D segmentation\nagainst noises. Remarkably, our optimization completes within 30 seconds, about\n50$\\times$ faster than the best existing methods. Extensive experiments\ndemonstrate the efficiency and robustness of our method in segmenting various\nscenes, and its superior performance in downstream tasks such as object removal\nand inpainting. Demos and code will be available at\nhttps://github.com/florinshen/FlashSplat.\n","authors":["Qiuhong Shen","Xingyi Yang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08270v1.pdf","comment":"ECCV'2024"},{"id":"http://arxiv.org/abs/2409.08260v1","updated":"2024-09-12T17:55:37Z","published":"2024-09-12T17:55:37Z","title":"Improving Text-guided Object Inpainting with Semantic Pre-inpainting","summary":" Recent years have witnessed the success of large text-to-image diffusion\nmodels and their remarkable potential to generate high-quality images. The\nfurther pursuit of enhancing the editability of images has sparked significant\ninterest in the downstream task of inpainting a novel object described by a\ntext prompt within a designated region in the image. Nevertheless, the problem\nis not trivial from two aspects: 1) Solely relying on one single U-Net to align\ntext prompt and visual object across all the denoising timesteps is\ninsufficient to generate desired objects; 2) The controllability of object\ngeneration is not guaranteed in the intricate sampling space of the diffusion\nmodel. In this paper, we propose to decompose the typical single-stage object\ninpainting into two cascaded processes: 1) semantic pre-inpainting that infers\nthe semantic features of desired objects in a multi-modal feature space; 2)\nhigh-fidelity object generation in diffusion latent space that pivots on such\ninpainted semantic features. 
To achieve this, we cascade a Transformer-based\nsemantic inpainter and an object inpainting diffusion model, leading to a novel\nCAscaded Transformer-Diffusion (CAT-Diffusion) framework for text-guided object\ninpainting. Technically, the semantic inpainter is trained to predict the\nsemantic features of the target object conditioned on the unmasked context and\ntext prompt. The outputs of the semantic inpainter then act as the informative\nvisual prompts to guide high-fidelity object generation through a reference\nadapter layer, leading to controllable object inpainting. Extensive evaluations\non OpenImages-V6 and MSCOCO validate the superiority of CAT-Diffusion against\nthe state-of-the-art methods. Code is available at\n\\url{https://github.com/Nnn-s/CATdiffusion}.\n","authors":["Yifu Chen","Jingwen Chen","Yingwei Pan","Yehao Li","Ting Yao","Zhineng Chen","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.08260v1.pdf","comment":"ECCV 2024. Source code is available at\n https://github.com/Nnn-s/CATdiffusion"},{"id":"http://arxiv.org/abs/2409.08258v1","updated":"2024-09-12T17:55:11Z","published":"2024-09-12T17:55:11Z","title":"Improving Virtual Try-On with Garment-focused Diffusion Models","summary":" Diffusion models have revolutionized generative modeling in\nnumerous image synthesis tasks. Nevertheless, it is not trivial to directly\napply diffusion models for synthesizing an image of a target person wearing a\ngiven in-shop garment, i.e., the image-based virtual try-on (VTON) task. The\ndifficulty originates from the aspect that the diffusion process should not\nonly produce a holistically high-fidelity photorealistic image of the target\nperson, but also locally preserve every appearance and texture detail of the\ngiven garment. To address this, we shape a new Diffusion model, namely GarDiff,\nwhich triggers the garment-focused diffusion process with amplified guidance of\nboth basic visual appearance and detailed textures (i.e., high-frequency\ndetails) derived from the given garment. GarDiff first remoulds a pre-trained\nlatent diffusion model with additional appearance priors derived from the CLIP\nand VAE encodings of the reference garment. Meanwhile, a novel garment-focused\nadapter is integrated into the UNet of the diffusion model, pursuing local\nfine-grained alignment with the visual appearance of the reference garment and\nhuman pose. We specifically design an appearance loss over the synthesized\ngarment to enhance the crucial, high-frequency details. Extensive experiments\non VITON-HD and DressCode datasets demonstrate the superiority of our GarDiff\nwhen compared to state-of-the-art VTON approaches. Code is publicly available\nat:\n\\href{https://github.com/siqi0905/GarDiff/tree/master}{https://github.com/siqi0905/GarDiff/tree/master}.\n","authors":["Siqi Wan","Yehao Li","Jingwen Chen","Yingwei Pan","Ting Yao","Yang Cao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.08258v1.pdf","comment":"ECCV 2024. Source code is available at\n https://github.com/siqi0905/GarDiff/tree/master"},{"id":"http://arxiv.org/abs/2409.08206v1","updated":"2024-09-12T16:46:41Z","published":"2024-09-12T16:46:41Z","title":"ComAlign: Compositional Alignment in Vision-Language Models","summary":" Vision-language models (VLMs) like CLIP have showcased a remarkable ability\nto extract transferable features for downstream tasks. 
Nonetheless, the\ntraining process of these models is usually based on a coarse-grained\ncontrastive loss between the global embedding of images and texts which may\nlose the compositional structure of these modalities. Many recent studies have\nshown VLMs lack compositional understandings like attribute binding and\nidentifying object relationships. Although some recent methods have tried to\nachieve finer-level alignments, they either are not based on extracting\nmeaningful components of proper granularity or don't properly utilize the\nmodalities' correspondence (especially in image-text pairs with more\ningredients). Addressing these limitations, we introduce Compositional\nAlignment (ComAlign), a fine-grained approach to discover more exact\ncorrespondence of text and image components using only the weak supervision in\nthe form of image-text pairs. Our methodology emphasizes that the compositional\nstructure (including entities and relations) extracted from the text modality\nmust also be retained in the image modality. To enforce correspondence of\nfine-grained concepts in image and text modalities, we train a lightweight\nnetwork lying on top of existing visual and language encoders using a small\ndataset. The network is trained to align nodes and edges of the structure\nacross the modalities. Experimental results on various VLMs and datasets\ndemonstrate significant improvements in retrieval and compositional benchmarks,\naffirming the effectiveness of our plugin model.\n","authors":["Ali Abdollah","Amirmohammad Izadi","Armin Saghafian","Reza Vahidimajd","Mohammad Mozafari","Amirreza Mirzaei","Mohammadmahdi Samiei","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2409.08206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21757v2","updated":"2024-09-12T14:01:56Z","published":"2024-07-31T17:23:57Z","title":"Learning Video Context as Interleaved Multimodal Sequences","summary":" Narrative videos, such as movies, pose significant challenges in video\nunderstanding due to their rich contexts (characters, dialogues, storylines)\nand diverse demands (identify who, relationship, and reason). In this paper, we\nintroduce MovieSeq, a multimodal language model developed to address the wide\nrange of challenges in understanding video contexts. Our core idea is to\nrepresent videos as interleaved multimodal sequences (including images, plots,\nvideos, and subtitles), either by linking external knowledge databases or using\noffline models (such as whisper for subtitles). Through instruction-tuning,\nthis approach empowers the language model to interact with videos using\ninterleaved multimodal instructions. For example, instead of solely relying on\nvideo as input, we jointly provide character photos alongside their names and\ndialogues, allowing the model to associate these elements and generate more\ncomprehensive responses. To demonstrate its effectiveness, we validate\nMovieSeq's performance on six datasets (LVU, MAD, Movienet, CMD, TVC, MovieQA)\nacross five settings (video classification, audio description, video-text\nretrieval, video captioning, and video question-answering). 
The code will be\npublic at https://github.com/showlab/MovieSeq.\n","authors":["Kevin Qinghong Lin","Pengchuan Zhang","Difei Gao","Xide Xia","Joya Chen","Ziteng Gao","Jinheng Xie","Xuhong Xiao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2407.21757v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2403.05192v2","updated":"2024-09-12T11:46:32Z","published":"2024-03-08T10:14:32Z","title":"An End-to-End Pipeline Perspective on Video Streaming in Best-Effort\n Networks: A Survey and Tutorial","summary":" Remaining a dominant force in Internet traffic, video streaming captivates\nend users, service providers, and researchers. This paper takes a pragmatic\napproach to reviewing recent advances in the field by focusing on the prevalent\nstreaming paradigm that involves delivering long-form two-dimensional videos\nover the best-effort Internet with client-side adaptive bitrate (ABR)\nalgorithms and assistance from content delivery networks (CDNs). To enhance\naccessibility, we supplement the survey with tutorial material. Unlike existing\nsurveys that offer fragmented views, our work provides a holistic perspective\non the entire end-to-end streaming pipeline, from video capture by a\ncamera-equipped device to playback by the end user. Our novel perspective\ncovers the ingestion, processing, and distribution stages of the pipeline and\naddresses key challenges such as video compression, upload, transcoding, ABR\nalgorithms, CDN support, and quality of experience. We review over 200 papers\nand classify streaming designs by their problem-solving methodology, whether\nbased on intuition (simple heuristics), theory (formal optimization), or\nmachine learning (generalizable data patterns). The survey further refines\nthese methodology-based categories and characterizes each design by additional\ntraits such as compatible codecs and use of super resolution. We connect the\nreviewed research to real-world applications by discussing the practices of\ncommercial streaming platforms. Finally, the survey highlights prominent\ncurrent trends and outlines future directions in video streaming.\n","authors":["Leonardo Peroni","Sergey Gorinsky"],"pdf_url":"https://arxiv.org/pdf/2403.05192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07901v1","updated":"2024-09-12T10:10:22Z","published":"2024-09-12T10:10:22Z","title":"Bridging Discrete and Continuous: A Multimodal Strategy for Complex\n Emotion Detection","summary":" In the domain of human-computer interaction, accurately recognizing and\ninterpreting human emotions is crucial yet challenging due to the complexity\nand subtlety of emotional expressions. This study explores the potential for\ndetecting a rich and flexible range of emotions through a multimodal approach\nwhich integrates facial expressions, voice tones, and transcripts from video\nclips. We propose a novel framework that maps a variety of emotions in a\nthree-dimensional Valence-Arousal-Dominance (VAD) space, which could reflect\nthe fluctuations and positivity/negativity of emotions to enable a more varied\nand comprehensive representation of emotional states. We employed K-means\nclustering to transition emotions from traditional discrete categorization to a\ncontinuous labeling system and built a classifier for emotion recognition upon\nthis system. 
The effectiveness of the proposed model is evaluated using the\nMER2024 dataset, which contains culturally consistent video clips from Chinese\nmovies and TV series, annotated with both discrete and open-vocabulary emotion\nlabels. Our experiment successfully achieved the transformation between\ndiscrete and continuous models, and the proposed model generated a more diverse\nand comprehensive set of emotion vocabulary while maintaining strong accuracy.\n","authors":["Jiehui Jia","Huan Zhang","Jinhua Liang"],"pdf_url":"https://arxiv.org/pdf/2409.07901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07855v1","updated":"2024-09-12T09:00:56Z","published":"2024-09-12T09:00:56Z","title":"MSMF: Multi-Scale Multi-Modal Fusion for Enhanced Stock Market\n Prediction","summary":" This paper presents MSMF (Multi-Scale Multi-Modal Fusion), a novel approach\nfor enhanced stock market prediction. MSMF addresses key challenges in\nmulti-modal stock analysis by integrating a modality completion encoder,\nmulti-scale feature extraction, and an innovative fusion mechanism. Our model\nleverages blank learning and progressive fusion to balance complementarity and\nredundancy across modalities, while multi-scale alignment facilitates direct\ncorrelations between heterogeneous data types. We introduce Multi-Granularity\nGates and a specialized architecture to optimize the integration of local and\nglobal information for different tasks. Additionally, a Task-targeted\nPrediction layer is employed to preserve both coarse and fine-grained features\nduring fusion. Experimental results demonstrate that MSMF outperforms existing\nmethods, achieving significant improvements in accuracy and reducing prediction\nerrors across various stock market forecasting tasks. This research contributes\nvaluable insights to the field of multi-modal financial analysis and offers a\nrobust framework for enhanced market prediction.\n","authors":["Jiahao Qin"],"pdf_url":"https://arxiv.org/pdf/2409.07855v1.pdf","comment":"15 pages, 1 figures, 7 tables"},{"id":"http://arxiv.org/abs/2409.07827v1","updated":"2024-09-12T08:19:25Z","published":"2024-09-12T08:19:25Z","title":"Bridging Paintings and Music -- Exploring Emotion based Music Generation\n through Paintings","summary":" Rapid advancements in artificial intelligence have significantly enhanced\ngenerative tasks involving music and images, employing both unimodal and\nmultimodal approaches. This research develops a model capable of generating\nmusic that resonates with the emotions depicted in visual arts, integrating\nemotion labeling, image captioning, and language models to transform visual\ninputs into musical compositions. Addressing the scarcity of aligned art and\nmusic data, we curated the Emotion Painting Music Dataset, pairing paintings\nwith corresponding music for effective training and evaluation. Our dual-stage\nframework converts images to text descriptions of emotional content and then\ntransforms these descriptions into music, facilitating efficient learning with\nminimal data. Performance is evaluated using metrics such as Fr\\'echet Audio\nDistance (FAD), Total Harmonic Distortion (THD), Inception Score (IS), and KL\ndivergence, with audio-emotion text similarity confirmed by the pre-trained\nCLAP model to demonstrate high alignment between generated music and text. 
This\nsynthesis tool bridges visual art and music, enhancing accessibility for the\nvisually impaired and opening avenues in educational and therapeutic\napplications by providing enriched multi-sensory experiences.\n","authors":["Tanisha Hisariya","Huan Zhang","Jinhua Liang"],"pdf_url":"https://arxiv.org/pdf/2409.07827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07759v1","updated":"2024-09-12T05:33:15Z","published":"2024-09-12T05:33:15Z","title":"SwinGS: Sliding Window Gaussian Splatting for Volumetric Video Streaming\n with Arbitrary Length","summary":" Recent advances in 3D Gaussian Splatting (3DGS) have garnered significant\nattention in computer vision and computer graphics due to its high rendering\nspeed and remarkable quality. While extant research has endeavored to extend\nthe application of 3DGS from static to dynamic scenes, such efforts have been\nconsistently impeded by excessive model sizes, constraints on video duration,\nand content deviation. These limitations significantly compromise the\nstreamability of dynamic 3D Gaussian models, thereby restricting their utility\nin downstream applications, including volumetric video, autonomous vehicles, and\nimmersive technologies such as virtual, augmented, and mixed reality.\n This paper introduces SwinGS, a novel framework for training, delivering, and\nrendering volumetric video in a real-time streaming fashion. To address the\naforementioned challenges and enhance streamability, SwinGS integrates\nspacetime Gaussian with Markov Chain Monte Carlo (MCMC) to adapt the model to\nfit various 3D scenes across frames, in the meantime employing a sliding window\nthat captures Gaussian snapshots for each frame in an accumulative way. We implement\na prototype of SwinGS and demonstrate its streamability across various datasets\nand scenes. Additionally, we develop an interactive WebGL viewer enabling\nreal-time volumetric video playback on most devices with modern browsers,\nincluding smartphones and tablets. Experimental results show that SwinGS\nreduces transmission costs by 83.6% compared to previous work with negligible\ncompromise in PSNR. Moreover, SwinGS easily scales to long video sequences\nwithout compromising quality.\n","authors":["Bangya Liu","Suman Banerjee"],"pdf_url":"https://arxiv.org/pdf/2409.07759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07701v1","updated":"2024-09-12T02:04:26Z","published":"2024-09-12T02:04:26Z","title":"TMFNet: Two-Stream Multi-Channels Fusion Networks for Color Image\n Operation Chain Detection","summary":" Image operation chain detection techniques have gained increasing attention\nrecently in the field of multimedia forensics. However, existing detection\nmethods suffer from the generalization problem. Moreover, the channel\ncorrelation of color images that provides additional forensic evidence is often\nignored. To solve these issues, in this article, we propose a novel two-stream\nmulti-channels fusion network for color image operation chain detection in\nwhich the spatial artifact stream and the noise residual stream are explored in\na complementary manner. Specifically, we first propose a novel deep residual\narchitecture without pooling in the spatial artifact stream for learning the\nglobal feature representation of multi-channel correlation. 
Then, a set of\nfilters is designed to aggregate the correlation information of multi-channels\nwhile capturing the low-level features in the noise residual stream.\nSubsequently, the high-level features are extracted by the deep residual model.\nFinally, features from the two streams are fed into a fusion module, to\neffectively learn richer discriminative representations of the operation chain.\nExtensive experiments show that the proposed method achieves state-of-the-art\ngeneralization ability while maintaining robustness to JPEG compression. The\nsource code used in these experiments will be released at\nhttps://github.com/LeiTan-98/TMFNet.\n","authors":["Yakun Niu","Lei Tan","Lei Zhang","Xianyu Zuo"],"pdf_url":"https://arxiv.org/pdf/2409.07701v1.pdf","comment":"15 pages, 12 figures"}]},"2024-09-11T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.07641v1","updated":"2024-09-11T21:53:20Z","published":"2024-09-11T21:53:20Z","title":"SimulBench: Evaluating Language Models with Creative Simulation Tasks","summary":" We introduce SimulBench, a benchmark designed to evaluate large language\nmodels (LLMs) across a diverse collection of creative simulation scenarios,\nsuch as acting as a Linux terminal or playing text games with users. While\nthese simulation tasks serve as effective measures of an LLM's general\nintelligence, they are seldom incorporated into existing benchmarks. A major\nchallenge is to develop an evaluation framework for testing different LLMs\nfairly while preserving the multi-round interactive nature of simulation tasks\nbetween users and AI. To tackle this issue, we suggest using a fixed LLM as a\nuser agent to engage with an LLM to collect dialogues first under different\ntasks. Then, challenging dialogue scripts are extracted for evaluating\ndifferent target LLMs. To facilitate automatic assessment on \\DataName{}, GPT-4\nis employed as the evaluator, tasked with reviewing the quality of the final\nresponse generated by the target LLMs given multi-turn dialogue scripts. Our\ncomprehensive experiments indicate that these simulation tasks continue to pose\na significant challenge with their unique natures and show the gap between\nproprietary models and the most advanced open LLMs. For example, GPT-4-turbo\noutperforms LLaMA-3-70b-Chat on 18.55\\% more cases.\n","authors":["Qi Jia","Xiang Yue","Tianyu Zheng","Jie Huang","Bill Yuchen Lin"],"pdf_url":"https://arxiv.org/pdf/2409.07641v1.pdf","comment":"Website: https://simulbench.github.io/"},{"id":"http://arxiv.org/abs/2409.03166v2","updated":"2024-09-11T21:52:22Z","published":"2024-09-05T01:51:54Z","title":"Continual Skill and Task Learning via Dialogue","summary":" Continual and interactive robot learning is a challenging problem as the\nrobot is present with human users who expect the robot to learn novel skills to\nsolve novel tasks perpetually with sample efficiency. In this work we present a\nframework for robots to query and learn visuo-motor robot skills and task\nrelevant information via natural language dialog interactions with human users.\nPrevious approaches either focus on improving the performance of instruction\nfollowing agents, or passively learn novel skills or concepts. Instead, we used\ndialog combined with a language-skill grounding embedding to query or confirm\nskills and/or tasks requested by a user. To achieve this goal, we developed and\nintegrated three different components for our agent. 
Firstly, we propose a\nnovel visual-motor control policy ACT with Low Rank Adaptation (ACT-LoRA),\nwhich enables the existing SoTA ACT model to perform few-shot continual\nlearning. Secondly, we develop an alignment model that projects demonstrations\nacross skill embodiments into a shared embedding allowing us to know when to\nask questions and/or demonstrations from users. Finally, we integrated an\nexisting LLM to interact with a human user to perform grounded interactive\ncontinual skill learning to solve a task. Our ACT-LoRA model learns novel\nfine-tuned skills with a 100% accuracy when trained with only five\ndemonstrations for a novel skill while still maintaining a 74.75% accuracy on\npre-trained skills in the RLBench dataset where other models fall significantly\nshort. We also performed a human-subjects study with 8 subjects to demonstrate\nthe continual learning capabilities of our combined framework. We achieve a\nsuccess rate of 75% in the task of sandwich making with the real robot learning\nfrom participant data demonstrating that robots can learn novel skills or task\nknowledge from dialogue with non-expert users using our approach.\n","authors":["Weiwei Gu","Suresh Kondepudi","Lixiao Huang","Nakul Gopalan"],"pdf_url":"https://arxiv.org/pdf/2409.03166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07638v1","updated":"2024-09-11T21:48:33Z","published":"2024-09-11T21:48:33Z","title":"Can We Count on LLMs? The Fixed-Effect Fallacy and Claims of GPT-4\n Capabilities","summary":" In this paper we explore evaluation of LLM capabilities. We present\nmeasurements of GPT-4 performance on several deterministic tasks; each task\ninvolves a basic calculation and takes as input parameter some element drawn\nfrom a large well-defined population (e.g., count elements in a list, multiply\ntwo k-digit numbers, etc). We examine several conditions per-task and perform\nenough trials so that statistically significant differences can be detected.\nThis allows us to investigate the sensitivity of task-accuracy both to query\nphrasing and input parameter population. We find that seemingly trivial\nmodifications in the task-prompt or input population can yield differences far\nlarger than can be explained by sampling effects. For example, performance on a\nsimple list-counting task varies with query-phrasing and list-length, but also\nwith list composition (i.e., the thing-to-be-counted) and object frequency\n(e.g., success when an element accounts for $\\approx$ 50\\% of a list is\ndifferent from when it accounts for $\\approx$ 70\\% etc).\n We conclude that efforts to quantify LLM capabilities easily succumb to the\nlanguage-as-fixed-effect fallacy, where experimental observations are\nimproperly generalized beyond what the data supports. A consequence appears to\nbe that intuitions that have been formed based on interactions with humans form\na very unreliable guide as to which input modifications should ``make no\ndifference'' to LLM performance.\n","authors":["Thomas Ball","Shuo Chen","Cormac Herley"],"pdf_url":"https://arxiv.org/pdf/2409.07638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07619v5","updated":"2024-09-11T21:23:04Z","published":"2023-04-15T19:22:37Z","title":"Can ChatGPT Forecast Stock Price Movements? Return Predictability and\n Large Language Models","summary":" We document the capability of large language models (LLMs) like ChatGPT to\npredict stock price movements using news headlines, even without direct\nfinancial training. 
ChatGPT scores significantly predict out-of-sample daily\nstock returns, subsuming traditional methods, and predictability is stronger\namong smaller stocks and following negative news. To explain these findings, we\ndevelop a theoretical model incorporating information capacity constraints,\nunderreaction, limits-to-arbitrage, and LLMs. The model generates several key\npredictions, which we empirically test: (i) it establishes a critical threshold\nin AI capabilities necessary for profitable predictions, (ii) it demonstrates\nthat only advanced LLMs can effectively interpret complex information, and\n(iii) it predicts that widespread LLM adoption can enhance market efficiency.\nOur results suggest that sophisticated return forecasting is an emerging\ncapability of AI systems and that these technologies can alter information\ndiffusion and decision-making processes in financial markets. Finally, we\nintroduce an interpretability framework to evaluate LLMs' reasoning,\ncontributing to AI transparency and economic decision-making.\n","authors":["Alejandro Lopez-Lira","Yuehua Tang"],"pdf_url":"https://arxiv.org/pdf/2304.07619v5.pdf","comment":"Previously posted in SSRN\n https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4412788"},{"id":"http://arxiv.org/abs/2409.07627v1","updated":"2024-09-11T21:18:21Z","published":"2024-09-11T21:18:21Z","title":"Leveraging User-Generated Reviews for Recommender Systems with Dynamic\n Headers","summary":" E-commerce platforms have a vast catalog of items to cater to their\ncustomers' shopping interests. Most of these platforms assist their customers\nin the shopping process by offering optimized recommendation carousels,\ndesigned to help customers quickly locate their desired items. Many models have\nbeen proposed in academic literature to generate and enhance the ranking and\nrecall set of items in these carousels. Conventionally, the accompanying\ncarousel title text (header) of these carousels remains static. In most\ninstances, a generic text such as \"Items similar to your current viewing\" is\nutilized. Fixed variations such as the inclusion of specific attributes \"Other\nitems from a similar seller\" or \"Items from a similar brand\" in addition to\n\"frequently bought together\" or \"considered together\" are observed as well.\nThis work proposes a novel approach to customize the header generation process\nof these carousels. Our work leverages user-generated reviews that lay focus on\nspecific attributes (aspects) of an item that were favorably perceived by users\nduring their interaction with the given item. We extract these aspects from\nreviews and train a graph neural network-based model under the framework of a\nconditional ranking task. We refer to our innovative methodology as Dynamic\nText Snippets (DTS) which generates multiple header texts for an anchor item\nand its recall set. 
Our approach demonstrates the potential of utilizing\nuser-generated reviews and presents a unique paradigm for exploring\nincreasingly context-aware recommendation systems.\n","authors":["Shanu Vashishtha","Abhay Kumar","Lalitesh Morishetti","Kaushiki Nag","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2409.07627v1.pdf","comment":"7 pages, 3 figures, PAIS 2024 (ECAI)"},{"id":"http://arxiv.org/abs/2409.07615v1","updated":"2024-09-11T20:55:12Z","published":"2024-09-11T20:55:12Z","title":"Zero-Shot Machine-Generated Text Detection Using Mixture of Large\n Language Models","summary":" The dissemination of Large Language Models (LLMs), trained at scale, and\nendowed with powerful text-generating abilities has vastly increased the\nthreats posed by generative AI technologies by reducing the cost of producing\nharmful, toxic, faked or forged content. In response, various proposals have\nbeen made to automatically discriminate artificially generated from\nhuman-written texts, typically framing the problem as a classification problem.\nMost approaches evaluate an input document by a well-chosen detector LLM,\nassuming that low-perplexity scores reliably signal machine-made content. As\nusing one single detector can induce brittleness of performance, we instead\nconsider several and derive a new, theoretically grounded approach to combine\ntheir respective strengths. Our experiments, using a variety of generator LLMs,\nsuggest that our method effectively increases the robustness of detection.\n","authors":["Matthieu Dubois","François Yvon","Pablo Piantanida"],"pdf_url":"https://arxiv.org/pdf/2409.07615v1.pdf","comment":"Preprint, work in progress"},{"id":"http://arxiv.org/abs/2408.13359v2","updated":"2024-09-11T20:48:05Z","published":"2024-08-23T20:22:20Z","title":"Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate\n Scheduler","summary":" Finding the optimal learning rate for language model pretraining is a\nchallenging task. This is not only because there is a complicated correlation\nbetween learning rate, batch size, number of training tokens, model size, and\nother hyperparameters but also because it is prohibitively expensive to perform\na hyperparameter search for large language models with Billions or Trillions of\nparameters. Recent studies propose using small proxy models and small corpus to\nperform hyperparameter searches and transposing the optimal parameters to large\nmodels and large corpus. While the zero-shot transferability is theoretically\nand empirically proven for model size related hyperparameters, like depth and\nwidth, the zero-shot transfer from small corpus to large corpus is\nunderexplored. In this paper, we study the correlation between optimal learning\nrate, batch size, and number of training tokens for the recently proposed WSD\nscheduler. After thousands of small experiments, we found a power-law\nrelationship between variables and demonstrated its transferability across\nmodel sizes. Based on the observation, we propose a new learning rate\nscheduler, Power scheduler, that is agnostic about the number of training\ntokens and batch size. The experiment shows that combining the Power scheduler\nwith Maximum Update Parameterization (muP) can consistently achieve impressive\nperformance with one set of hyperparameters regardless of the number of\ntraining tokens, batch size, model size, and even model architecture. 
Our 3B\ndense and MoE models trained with the Power scheduler achieve comparable\nperformance to state-of-the-art small language models. We open-source these\npretrained models at https://ibm.biz/BdKhLa.\n","authors":["Yikang Shen","Matthew Stallone","Mayank Mishra","Gaoyuan Zhang","Shawn Tan","Aditya Prasad","Adriana Meza Soria","David D. Cox","Rameswar Panda"],"pdf_url":"https://arxiv.org/pdf/2408.13359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07440v1","updated":"2024-09-11T17:37:48Z","published":"2024-09-11T17:37:48Z","title":"SUPER: Evaluating Agents on Setting Up and Executing Tasks from Research\n Repositories","summary":" Given that Large Language Models (LLMs) have made significant progress in\nwriting code, can they now be used to autonomously reproduce results from\nresearch repositories? Such a capability would be a boon to the research\ncommunity, helping researchers validate, understand, and extend prior work. To\nadvance towards this goal, we introduce SUPER, the first benchmark designed to\nevaluate the capability of LLMs in setting up and executing tasks from research\nrepositories. SUPER aims to capture the realistic challenges faced by\nresearchers working with Machine Learning (ML) and Natural Language Processing\n(NLP) research repositories. Our benchmark comprises three distinct problem\nsets: 45 end-to-end problems with annotated expert solutions, 152 sub-problems\nderived from the expert set that focus on specific challenges (e.g.,\nconfiguring a trainer), and 602 automatically generated problems for\nlarger-scale development. We introduce various evaluation measures to assess\nboth task success and progress, utilizing gold solutions when available or\napproximations otherwise. We show that state-of-the-art approaches struggle to\nsolve these problems, with the best model (GPT-4o) solving only 16.3% of the\nend-to-end set and 46.1% of the scenarios. This illustrates the challenge of\nthis task and suggests that SUPER can serve as a valuable resource for the\ncommunity to make and measure progress.\n","authors":["Ben Bogin","Kejuan Yang","Shashank Gupta","Kyle Richardson","Erin Bransom","Peter Clark","Ashish Sabharwal","Tushar Khot"],"pdf_url":"https://arxiv.org/pdf/2409.07440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07437v1","updated":"2024-09-11T17:34:52Z","published":"2024-09-11T17:34:52Z","title":"A Suite for Acoustic Language Model Evaluation","summary":" Speech language models have recently demonstrated great potential as\nuniversal speech processing systems. Such models have the ability to model the\nrich acoustic information existing in audio signals, beyond spoken content,\nsuch as emotion, background noise, etc. Despite this, evaluation benchmarks\nthat evaluate awareness of a wide range of acoustic aspects are lacking. To\nhelp bridge this gap, we introduce SALMon, a novel evaluation suite\nencompassing background noise, emotion, speaker identity and room impulse\nresponse. The proposed benchmarks both evaluate the consistency of the\ninspected element and how much it matches the spoken text. We follow a\nmodelling based approach, measuring whether a model gives correct samples\nhigher scores than incorrect ones. This approach makes the benchmark fast to\ncompute even for large models. We evaluated several speech language models on\nSALMon, thus highlighting the strengths and weaknesses of each evaluated\nmethod. 
Code and data are publicly available at\nhttps://pages.cs.huji.ac.il/adiyoss-lab/salmon/ .\n","authors":["Gallil Maimon","Amit Roth","Yossi Adi"],"pdf_url":"https://arxiv.org/pdf/2409.07437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07431v1","updated":"2024-09-11T17:21:59Z","published":"2024-09-11T17:21:59Z","title":"Synthetic continued pretraining","summary":" Pretraining on large-scale, unstructured internet text has enabled language\nmodels to acquire a significant amount of world knowledge. However, this\nknowledge acquisition is data-inefficient -- to learn a given fact, models must\nbe trained on hundreds to thousands of diverse representations of it. This\nposes a challenge when adapting a pretrained model to a small corpus of\ndomain-specific documents, where each fact may appear rarely or only once. We\npropose to bridge this gap with synthetic continued pretraining: using the\nsmall domain-specific corpus to synthesize a large corpus more amenable to\nlearning, and then performing continued pretraining on the synthesized corpus.\nWe instantiate this proposal with EntiGraph, a synthetic data augmentation\nalgorithm that extracts salient entities from the source documents and then\ngenerates diverse text by drawing connections between the sampled entities.\nSynthetic continued pretraining using EntiGraph enables a language model to\nanswer questions and follow generic instructions related to the source\ndocuments without access to them. If instead, the source documents are\navailable at inference time, we show that the knowledge acquired through our\napproach compounds with retrieval-augmented generation. To better understand\nthese results, we build a simple mathematical model of EntiGraph, and show how\nsynthetic data augmentation can \"rearrange\" knowledge to enable more\ndata-efficient learning.\n","authors":["Zitong Yang","Neil Band","Shuangping Li","Emmanuel Candès","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.07431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07429v1","updated":"2024-09-11T17:21:00Z","published":"2024-09-11T17:21:00Z","title":"Agent Workflow Memory","summary":" Despite the potential of language model-based agents to solve real-world\ntasks such as web navigation, current methods still struggle with long-horizon\ntasks with complex action trajectories. In contrast, humans can flexibly solve\ncomplex tasks by learning reusable task workflows from past experiences and\nusing them to guide future actions. To build agents that can similarly benefit\nfrom this process, we introduce Agent Workflow Memory (AWM), a method for\ninducing commonly reused routines, i.e., workflows, and selectively providing\nworkflows to the agent to guide subsequent generations. AWM flexibly applies to\nboth offline and online scenarios, where agents induce workflows from training\nexamples beforehand or from test queries on the fly. We experiment on two major\nweb navigation benchmarks -- Mind2Web and WebArena -- that collectively cover\n1000+ tasks from 200+ domains across travel, shopping, and social media, among\nothers. AWM substantially improves the baseline results by 24.6% and 51.1%\nrelative success rate on Mind2Web and WebArena while reducing the number of\nsteps taken to solve WebArena tasks successfully. 
Furthermore, online AWM\nrobustly generalizes in cross-task, website, and domain evaluations, surpassing\nbaselines from 8.9 to 14.0 absolute points as train-test task distribution gaps\nwiden.\n","authors":["Zora Zhiruo Wang","Jiayuan Mao","Daniel Fried","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2409.07429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07424v1","updated":"2024-09-11T17:10:20Z","published":"2024-09-11T17:10:20Z","title":"Towards Fairer Health Recommendations: finding informative unbiased\n samples via Word Sense Disambiguation","summary":" There have been growing concerns around high-stake applications that rely on\nmodels trained with biased data, which consequently produce biased predictions,\noften harming the most vulnerable. In particular, biased medical data could\ncause health-related applications and recommender systems to create outputs\nthat jeopardize patient care and widen disparities in health outcomes. A recent\nframework titled Fairness via AI posits that, instead of attempting to correct\nmodel biases, researchers must focus on their root causes by using AI to debias\ndata. Inspired by this framework, we tackle bias detection in medical curricula\nusing NLP models, including LLMs, and evaluate them on a gold standard dataset\ncontaining 4,105 excerpts annotated by medical experts for bias from a large\ncorpus. We build on previous work by coauthors which augments the set of\nnegative samples with non-annotated text containing social identifier terms.\nHowever, some of these terms, especially those related to race and ethnicity,\ncan carry different meanings (e.g., \"white matter of spinal cord\"). To address\nthis issue, we propose the use of Word Sense Disambiguation models to refine\ndataset quality by removing irrelevant sentences. We then evaluate fine-tuned\nvariations of BERT models as well as GPT models with zero- and few-shot\nprompting. We found LLMs, considered SOTA on many NLP tasks, unsuitable for\nbias detection, while fine-tuned BERT models generally perform well across all\nevaluated metrics.\n","authors":["Gavin Butts","Pegah Emdad","Jethro Lee","Shannon Song","Chiman Salavati","Willmar Sosa Diaz","Shiri Dori-Hacohen","Fabricio Murai"],"pdf_url":"https://arxiv.org/pdf/2409.07424v1.pdf","comment":"Accepted for long presentation at the FAcctRec @ Recsys 2024"},{"id":"http://arxiv.org/abs/2409.07423v1","updated":"2024-09-11T17:09:49Z","published":"2024-09-11T17:09:49Z","title":"Enhancing adversarial robustness in Natural Language Inference using\n explanations","summary":" The surge of state-of-the-art Transformer-based models has undoubtedly pushed\nthe limits of NLP model performance, excelling in a variety of tasks. We cast\nthe spotlight on the underexplored task of Natural Language Inference (NLI),\nsince models trained on popular well-suited datasets are susceptible to\nadversarial attacks, allowing subtle input interventions to mislead the model.\nIn this work, we validate the usage of natural language explanation as a\nmodel-agnostic defence strategy through extensive experimentation: only by\nfine-tuning a classifier on the explanation rather than premise-hypothesis\ninputs, robustness under various adversarial attacks is achieved in comparison\nto explanation-free baselines. 
Moreover, since there is no standard strategy of\ntesting the semantic validity of the generated explanations, we research the\ncorrelation of widely used language generation metrics with human perception,\nin order for them to serve as a proxy towards robust NLI models. Our approach\nis resource-efficient and reproducible without significant computational\nlimitations.\n","authors":["Alexandros Koulakos","Maria Lymperaiou","Giorgos Filandrianos","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2409.07423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07402v1","updated":"2024-09-11T16:42:22Z","published":"2024-09-11T16:42:22Z","title":"What to align in multimodal contrastive learning?","summary":" Humans perceive the world through multisensory integration, blending the\ninformation of different modalities to adapt their behavior. Contrastive\nlearning offers an appealing solution for multimodal self-supervised learning.\nIndeed, by considering each modality as a different view of the same entity, it\nlearns to align features of different modalities in a shared representation\nspace. However, this approach is intrinsically limited as it only learns shared\nor redundant information between modalities, while multimodal interactions can\narise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal\nlearning strategy that enables the communication between modalities in a single\nmultimodal space. Instead of imposing cross- or intra- modality constraints, we\npropose to align multimodal representations by maximizing the mutual\ninformation between augmented versions of these multimodal features. Our\ntheoretical analysis shows that shared, synergistic and unique terms of\ninformation naturally emerge from this formulation, allowing us to estimate\nmultimodal interactions beyond redundancy. We test CoMM both in a controlled\nand in a series of real-world settings: in the former, we demonstrate that CoMM\neffectively captures redundant, unique and synergistic information between\nmodalities. In the latter, CoMM learns complex multimodal interactions and\nachieves state-of-the-art results on the six multimodal benchmarks.\n","authors":["Benoit Dufumier","Javiera Castillo-Navarro","Devis Tuia","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2409.07402v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2409.07394v1","updated":"2024-09-11T16:35:18Z","published":"2024-09-11T16:35:18Z","title":"AdaCAD: Adaptively Decoding to Balance Conflicts between Contextual and\n Parametric Knowledge","summary":" Knowledge conflict arises from discrepancies between information in the\ncontext of a large language model (LLM) and the knowledge stored in its\nparameters. This can hurt performance when using standard decoding techniques,\nwhich tend to ignore the context. Existing test-time contrastive methods seek\nto address this by comparing the LLM's output distribution with and without the\ncontext and adjust the model according to the contrast between them. However,\nwe find that these methods frequently misjudge the degree of conflict and\nstruggle to handle instances that vary in their amount of conflict, with static\nmethods over-adjusting when conflict is absent. We propose a fine-grained,\ninstance-level approach called AdaCAD, which dynamically infers the weight of\nadjustment based on the degree of conflict, as measured by the Jensen-Shannon\ndivergence between distributions representing contextual and parametric\nknowledge. 
Our experiments across four models on six diverse question-answering\n(QA) datasets and three summarization tasks demonstrate that our training-free\nadaptive method consistently outperforms other decoding methods on QA, with\naverage accuracy gains of 14.21% (absolute) over a static contrastive baseline,\nand improves the factuality of summaries by 5.59 (AlignScore). Furthermore, our\nanalysis shows that while decoding with contrastive baselines hurts performance\nwhen conflict is absent, AdaCAD mitigates these losses, making it more\napplicable to real-world datasets in which some examples have conflict and\nothers do not.\n","authors":["Han Wang","Archiki Prasad","Elias Stengel-Eskin","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2409.07394v1.pdf","comment":"16 pages, Code: https://github.com/HanNight/AdaCAD"},{"id":"http://arxiv.org/abs/2409.02076v3","updated":"2024-09-11T16:35:00Z","published":"2024-09-03T17:25:54Z","title":"LongGenbench: Benchmarking Long-Form Generation in Long Context LLMs","summary":" The abilities of long-context language models (LMs) are often evaluated using\nthe \"Needle-in-a-Haystack\" (NIAH) test, which comprises tasks designed to\nassess a model's ability to identify specific information (\"needle\") within\nlarge text sequences (\"haystack\"). While these benchmarks measure how well\nmodels understand long-context input sequences, they do not effectively gauge\nthe quality of long-form text generation--a critical aspect for applications\nsuch as design proposals and creative writing. To address this gap, we have\nintroduced a new long-form text evaluation benchmark, LongGenbench, which tests\nmodels' ability to identify specific events within generated long text\nsequences. In this benchmark, we prompt long-context LMs to create long-form\ntext that must include particular events or constraints and evaluate their\nability to incorporate these elements. We evaluated ten long-context LMs across\nfour distinct scenarios, three types of prompt instructions, and two different\ngeneration-length settings (16K and 32K). Although these models perform well on\nNIAH benchmarks, none demonstrated satisfactory performance on the\nLongGenbench, raising concerns about their ability to generate coherent\nlong-form text that follows instructions. Additionally, as the length of the\ngenerated text increases, all models exhibit a significant drop in performance.\n","authors":["Yuhao Wu","Ming Shan Hee","Zhiqing Hu","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2409.02076v3.pdf","comment":"work in progress. arXiv admin note: text overlap with\n arXiv:2404.06654 by other authors"},{"id":"http://arxiv.org/abs/2402.00711v3","updated":"2024-09-11T16:32:15Z","published":"2024-02-01T16:06:35Z","title":"Explaining Text Classifiers with Counterfactual Representations","summary":" One well motivated explanation method for classifiers leverages\ncounterfactuals which are hypothetical events identical to real observations in\nall aspects except for one feature. Constructing such counterfactual poses\nspecific challenges for texts, however, as some attribute values may not\nnecessarily align with plausible real-world events. In this paper we propose a\nsimple method for generating counterfactuals by intervening in the space of\ntext representations which bypasses this limitation. We argue that our\ninterventions are minimally disruptive and that they are theoretically sound as\nthey align with counterfactuals as defined in Pearl's causal inference\nframework. 
To validate our method, we conducted experiments first on a\nsynthetic dataset and then on a realistic dataset of counterfactuals. This\nallows for a direct comparison between classifier predictions based on ground\ntruth counterfactuals - obtained through explicit text interventions - and our\ncounterfactuals, derived through interventions in the representation space.\nEventually, we study a real world scenario where our counterfactuals can be\nleveraged both for explaining a classifier and for bias mitigation.\n","authors":["Pirmin Lemberger","Antoine Saillenfest"],"pdf_url":"https://arxiv.org/pdf/2402.00711v3.pdf","comment":"24 pages, 4 figures, accepted for publication in ECAI 2024"},{"id":"http://arxiv.org/abs/2402.18334v3","updated":"2024-09-11T16:28:29Z","published":"2024-02-28T13:54:57Z","title":"Learning to Generate Instruction Tuning Datasets for Zero-Shot Task\n Adaptation","summary":" We introduce Bonito, an open-source model for conditional task generation\nthat converts unannotated text into task-specific training datasets for\ninstruction tuning. We aim to enable zero-shot task adaptation of large\nlanguage models on users' specialized, private data. We train Bonito by\nfine-tuning a pretrained large language model on a new large-scale dataset with\n1.65M examples created by remixing existing instruction tuning datasets into\nmeta-templates. The meta-templates for a dataset produce training examples\nwhere the input is the unannotated text and the task attribute and the output\nconsists of the instruction and the response. We use Bonito to generate\nsynthetic tasks for seven datasets from specialized domains with unannotated\ntext across three task types -- yes-no question answering, extractive question\nanswering, and natural language inference -- and adapt language models. We show\nthat Bonito significantly improves the average performance of pretrained and\ninstruction tuned models over the de facto self supervised baseline. For\nexample, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral\nand Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1\npoints whereas the next word prediction objective undoes some of the benefits\nof instruction tuning and reduces the average performance by 0.8 F1 points. We\nconduct additional experiments with Bonito to understand the effects of the\ndomain, the size of the training set, and the choice of alternative synthetic\ntask generators. Overall, we show that learning with synthetic instruction\ntuning datasets is an effective way to adapt language models to new domains.\nThe model, dataset, and code are available at\nhttps://github.com/BatsResearch/bonito.\n","authors":["Nihal V. Nayak","Yiyang Nan","Avi Trost","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2402.18334v3.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2409.07388v1","updated":"2024-09-11T16:24:06Z","published":"2024-09-11T16:24:06Z","title":"Recent Trends of Multimodal Affective Computing: A Survey from NLP\n Perspective","summary":" Multimodal affective computing (MAC) has garnered increasing attention due to\nits broad applications in analyzing human behaviors and intentions, especially\nin text-dominated multimodal affective computing field. 
This survey presents\nthe recent trends of multimodal affective computing from NLP perspective\nthrough four hot tasks: multimodal sentiment analysis, multimodal emotion\nrecognition in conversation, multimodal aspect-based sentiment analysis and\nmultimodal multi-label emotion recognition. The goal of this survey is to\nexplore the current landscape of multimodal affective research, identify\ndevelopment trends, and highlight the similarities and differences across\nvarious tasks, offering a comprehensive report on the recent progress in\nmultimodal affective computing from an NLP perspective. This survey covers the\nformalization of tasks, provides an overview of relevant works, describes\nbenchmark datasets, and details the evaluation metrics for each task.\nAdditionally, it briefly discusses research in multimodal affective computing\ninvolving facial expressions, acoustic signals, physiological signals, and\nemotion causes. Additionally, we discuss the technical approaches, challenges,\nand future directions in multimodal affective computing. To support further\nresearch, we released a repository that compiles related works in multimodal\naffective computing, providing detailed resources and references for the\ncommunity.\n","authors":["Guimin Hu","Yi Xin","Weimin Lyu","Haojian Huang","Chang Sun","Zhihong Zhu","Lin Gui","Ruichu Cai"],"pdf_url":"https://arxiv.org/pdf/2409.07388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07372v1","updated":"2024-09-11T16:03:09Z","published":"2024-09-11T16:03:09Z","title":"Awaking the Slides: A Tuning-free and Knowledge-regulated AI Tutoring\n System via Language Model Coordination","summary":" The vast pre-existing slides serve as rich and important materials to carry\nlecture knowledge. However, effectively leveraging lecture slides to serve\nstudents is difficult due to the multi-modal nature of slide content and the\nheterogeneous teaching actions. We study the problem of discovering effective\ndesigns that convert a slide into an interactive lecture. We develop\nSlide2Lecture, a tuning-free and knowledge-regulated intelligent tutoring\nsystem that can (1) effectively convert an input lecture slide into a\nstructured teaching agenda consisting of a set of heterogeneous teaching\nactions; (2) create and manage an interactive lecture that generates responsive\ninteractions catering to student learning demands while regulating the\ninteractions to follow teaching actions. Slide2Lecture contains a complete\npipeline for learners to obtain an interactive classroom experience to learn\nthe slide. For teachers and developers, Slide2Lecture enables customization to\ncater to personalized demands. The evaluation rated by annotators and students\nshows that Slide2Lecture is effective in outperforming the remaining\nimplementation. Slide2Lecture's online deployment has made more than 200K\ninteraction with students in the 3K lecture sessions. 
We open source\nSlide2Lecture's implementation in\nhttps://anonymous.4open.science/r/slide2lecture-4210/.\n","authors":["Daniel Zhang-Li","Zheyuan Zhang","Jifan Yu","Joy Lim Jia Yin","Shangqing Tu","Linlu Gong","Haohua Wang","Zhiyuan Liu","Huiqin Liu","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2409.07372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13764v4","updated":"2024-09-11T15:47:11Z","published":"2024-02-21T12:38:59Z","title":"CriticEval: Evaluating Large Language Model as Critic","summary":" Critique ability, i.e., the capability of Large Language Models (LLMs) to\nidentify and rectify flaws in responses, is crucial for their applications in\nself-improvement and scalable oversight. While numerous studies have been\nproposed to evaluate critique ability of LLMs, their comprehensiveness and\nreliability are still limited. To overcome this problem, we introduce\nCriticEval, a novel benchmark designed to comprehensively and reliably evaluate\ncritique ability of LLMs. Specifically, to ensure the comprehensiveness,\nCriticEval evaluates critique ability from four dimensions across nine diverse\ntask scenarios. It evaluates both scalar-valued and textual critiques,\ntargeting responses of varying quality. To ensure the reliability, a large\nnumber of critiques are annotated to serve as references, enabling GPT-4 to\nevaluate textual critiques reliably. Extensive evaluations of open-source and\nclosed-source LLMs first validate the reliability of evaluation in CriticEval.\nThen, experimental results demonstrate the promising potential of open-source\nLLMs, the effectiveness of critique datasets and several intriguing\nrelationships between the critique ability and some critical factors, including\ntask types, response qualities and critique dimensions. Datasets and evaluation\ntoolkit for CriticEval will be publicly released.\n","authors":["Tian Lan","Wenwei Zhang","Chen Xu","Heyan Huang","Dahua Lin","Kai Chen","Xian-ling Mao"],"pdf_url":"https://arxiv.org/pdf/2402.13764v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07355v1","updated":"2024-09-11T15:40:07Z","published":"2024-09-11T15:40:07Z","title":"Think Together and Work Better: Combining Humans' and LLMs' Think-Aloud\n Outcomes for Effective Text Evaluation","summary":" This study introduces \\textbf{InteractEval}, a framework that integrates\nhuman expertise and Large Language Models (LLMs) using the Think-Aloud (TA)\nmethod to generate attributes for checklist-based text evaluation. By combining\nhuman flexibility and reasoning with LLM consistency, InteractEval outperforms\ntraditional non-LLM-based and LLM-based baselines across four distinct\ndimensions, consisting of Coherence, Fluency, Consistency, and Relevance. The\nexperiment also investigates the effectiveness of the TA method, showing that\nit promotes divergent thinking in both humans and LLMs, leading to the\ngeneration of a wider range of relevant attributes and enhance text evaluation\nperformance. Comparative analysis reveals that humans excel at identifying\nattributes related to internal quality (Coherence and Fluency), but LLMs\nperform better at those attributes related to external alignment (Consistency\nand Relevance). Consequently, leveraging both humans and LLMs together produces\nthe best evaluation outcomes. In other words, this study emphasizes the\nnecessity of effectively combining humans and LLMs in an automated\nchecklist-based text evaluation framework. 
The code is available at\n\\textbf{\\url{https://github.com/BBeeChu/InteractEval.git}}.\n","authors":["SeongYeub Chu","JongWoo Kim","MunYong Yi"],"pdf_url":"https://arxiv.org/pdf/2409.07355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07335v1","updated":"2024-09-11T15:16:25Z","published":"2024-09-11T15:16:25Z","title":"Explanation, Debate, Align: A Weak-to-Strong Framework for Language\n Model Generalization","summary":" The rapid advancement of artificial intelligence systems has brought the\nchallenge of AI alignment to the forefront of research, particularly in complex\ndecision-making and task execution. As these systems surpass human-level\nperformance in sophisticated problems, ensuring their alignment with human\nvalues, intentions, and ethical guidelines becomes crucial. Building on\nprevious work in explanation generation for human-agent alignment, we address\nthe more complex dynamics of multi-agent systems and human-AI teams. This paper\nintroduces a novel approach to model alignment through weak-to-strong\ngeneralization in the context of language models. We present a framework where\na strong model facilitates the improvement of a weaker model, bridging the gap\nbetween explanation generation and model alignment. Our method, formalized as a\nfacilitation function, allows for the transfer of capabilities from advanced\nmodels to less capable ones without direct access to extensive training data.\nOur results suggest that this facilitation-based approach not only enhances\nmodel performance but also provides insights into the nature of model alignment\nand the potential for scalable oversight of AI systems.\n","authors":["Mehrdad Zakershahrak","Samira Ghodratnama"],"pdf_url":"https://arxiv.org/pdf/2409.07335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04916v3","updated":"2024-09-11T14:59:21Z","published":"2023-11-02T04:01:04Z","title":"Explainable Identification of Hate Speech towards Islam using Graph\n Neural Networks","summary":" Islamophobic language on online platforms fosters intolerance, making\ndetection and elimination crucial for promoting harmony. Traditional hate\nspeech detection models rely on NLP techniques like tokenization,\npart-of-speech tagging, and encoder-decoder models. However, Graph Neural\nNetworks (GNNs), with their ability to utilize relationships between data\npoints, offer more effective detection and greater explainability. In this\nwork, we represent speeches as nodes and connect them with edges based on their\ncontext and similarity to develop the graph. This study introduces a novel\nparadigm using GNNs to identify and explain hate speech towards Islam. Our\nmodel leverages GNNs to understand the context and patterns of hate speech by\nconnecting texts via pretrained NLP-generated word embeddings, achieving\nstate-of-the-art performance and enhancing detection accuracy while providing\nvaluable explanations. 
This highlights the potential of GNNs in combating\nonline hate speech and fostering a safer, more inclusive online environment.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2311.04916v3.pdf","comment":"Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (non-archival)\n (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP\n 2024 : NLP for Positive Impact Workshop (archival)"},{"id":"http://arxiv.org/abs/2409.07314v1","updated":"2024-09-11T14:44:51Z","published":"2024-09-11T14:44:51Z","title":"MEDIC: Towards a Comprehensive Framework for Evaluating LLMs in Clinical\n Applications","summary":" The rapid development of Large Language Models (LLMs) for healthcare\napplications has spurred calls for holistic evaluation beyond frequently-cited\nbenchmarks like USMLE, to better reflect real-world performance. While\nreal-world assessments are valuable indicators of utility, they often lag\nbehind the pace of LLM evolution, likely rendering findings obsolete upon\ndeployment. This temporal disconnect necessitates a comprehensive upfront\nevaluation that can guide model selection for specific clinical applications.\nWe introduce MEDIC, a framework assessing LLMs across five critical dimensions\nof clinical competence: medical reasoning, ethics and bias, data and language\nunderstanding, in-context learning, and clinical safety. MEDIC features a novel\ncross-examination framework quantifying LLM performance across areas like\ncoverage and hallucination detection, without requiring reference outputs. We\napply MEDIC to evaluate LLMs on medical question-answering, safety,\nsummarization, note generation, and other tasks. Our results show performance\ndisparities across model sizes, baseline vs medically finetuned models, and\nhave implications on model selection for applications requiring specific model\nstrengths, such as low hallucination or lower cost of inference. MEDIC's\nmultifaceted evaluation reveals these performance trade-offs, bridging the gap\nbetween theoretical capabilities and practical implementation in healthcare\nsettings, ensuring that the most promising models are identified and adapted\nfor diverse healthcare applications.\n","authors":["Praveen K Kanithi","Clément Christophe","Marco AF Pimentel","Tathagata Raha","Nada Saadi","Hamza Javed","Svetlana Maslenkova","Nasir Hayat","Ronnie Rajan","Shadab Khan"],"pdf_url":"https://arxiv.org/pdf/2409.07314v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2404.05993v2","updated":"2024-09-11T14:42:29Z","published":"2024-04-09T03:54:28Z","title":"AEGIS: Online Adaptive AI Content Safety Moderation with Ensemble of LLM\n Experts","summary":" As Large Language Models (LLMs) and generative AI become more widespread, the\ncontent safety risks associated with their use also increase. We find a notable\ndeficiency in high-quality content safety datasets and benchmarks that\ncomprehensively cover a wide range of critical safety areas. To address this,\nwe define a broad content safety risk taxonomy, comprising 13 critical risk and\n9 sparse risk categories. Additionally, we curate AEGISSAFETYDATASET, a new\ndataset of approximately 26, 000 human-LLM interaction instances, complete with\nhuman annotations adhering to the taxonomy. We plan to release this dataset to\nthe community to further research and to help benchmark LLM models for safety.\nTo demonstrate the effectiveness of the dataset, we instruction-tune multiple\nLLM-based safety models. 
We show that our models (named AEGISSAFETYEXPERTS),\nnot only surpass or perform competitively with the state-of-the-art LLM-based\nsafety models and general purpose LLMs, but also exhibit robustness across\nmultiple jail-break attack categories. We also show how using\nAEGISSAFETYDATASET during the LLM alignment phase does not negatively impact\nthe performance of the aligned models on MT Bench scores. Furthermore, we\npropose AEGIS, a novel application of a no-regret online adaptation framework\nwith strong theoretical guarantees, to perform content moderation with an\nensemble of LLM content safety experts in deployment\n","authors":["Shaona Ghosh","Prasoon Varshney","Erick Galinkin","Christopher Parisien"],"pdf_url":"https://arxiv.org/pdf/2404.05993v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07286v1","updated":"2024-09-11T14:14:15Z","published":"2024-09-11T14:14:15Z","title":"Using Generative Agents to Create Tip Sheets for Investigative Data\n Reporting","summary":" This paper introduces a system using generative AI agents to create tip\nsheets for investigative data reporting. Our system employs three specialized\nagents--an analyst, a reporter, and an editor--to collaboratively generate and\nrefine tips from datasets. We validate this approach using real-world\ninvestigative stories, demonstrating that our agent-based system generally\ngenerates more newsworthy and accurate insights compared to a baseline model\nwithout agents, although some variability was noted between different stories.\nOur findings highlight the potential of generative AI to provide leads for\ninvestigative data reporting.\n","authors":["Joris Veerbeek","Nicholas Diakopoulos"],"pdf_url":"https://arxiv.org/pdf/2409.07286v1.pdf","comment":"Short paper to be presented at Computation + Journalism 2024"},{"id":"http://arxiv.org/abs/2409.07265v1","updated":"2024-09-11T13:40:27Z","published":"2024-09-11T13:40:27Z","title":"Cross-Dialect Text-To-Speech in Pitch-Accent Language Incorporating\n Multi-Dialect Phoneme-Level BERT","summary":" We explore cross-dialect text-to-speech (CD-TTS), a task to synthesize\nlearned speakers' voices in non-native dialects, especially in pitch-accent\nlanguages. CD-TTS is important for developing voice agents that naturally\ncommunicate with people across regions. We present a novel TTS model comprising\nthree sub-modules to perform competitively at this task. We first train a\nbackbone TTS model to synthesize dialect speech from a text conditioned on\nphoneme-level accent latent variables (ALVs) extracted from speech by a\nreference encoder. Then, we train an ALV predictor to predict ALVs tailored to\na target dialect from input text leveraging our novel multi-dialect\nphoneme-level BERT. We conduct multi-dialect TTS experiments and evaluate the\neffectiveness of our model by comparing it with a baseline derived from\nconventional dialect TTS methods. The results show that our model improves the\ndialectal naturalness of synthetic speech in CD-TTS.\n","authors":["Kazuki Yamauchi","Yuki Saito","Hiroshi Saruwatari"],"pdf_url":"https://arxiv.org/pdf/2409.07265v1.pdf","comment":"Accepted by IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.07246v1","updated":"2024-09-11T13:04:34Z","published":"2024-09-11T13:04:34Z","title":"Propaganda to Hate: A Multimodal Analysis of Arabic Memes with\n Multi-Agent LLMs","summary":" In the past decade, social media platforms have been used for information\ndissemination and consumption. 
While a major portion of the content is posted\nto promote citizen journalism and public awareness, some content is posted to\nmislead users. Among different content types such as text, images, and videos,\nmemes (text overlaid on images) are particularly prevalent and can serve as\npowerful vehicles for propaganda, hate, and humor. In the current literature,\nthere have been efforts to individually detect such content in memes. However,\nthe study of their intersection is very limited. In this study, we explore the\nintersection between propaganda and hate in memes using a multi-agent LLM-based\napproach. We extend the propagandistic meme dataset with coarse and\nfine-grained hate labels. Our finding suggests that there is an association\nbetween propaganda and hate in memes. We provide detailed experimental results\nthat can serve as a baseline for future studies. We will make the experimental\nresources publicly available to the community.\n","authors":["Firoj Alam","Md. Rafiul Biswas","Uzair Shah","Wajdi Zaghouani","Georgios Mikros"],"pdf_url":"https://arxiv.org/pdf/2409.07246v1.pdf","comment":"propaganda, hate-speech, disinformation, misinformation, fake news,\n LLMs, GPT-4, multimodality, multimodal LLMs"},{"id":"http://arxiv.org/abs/2307.01310v2","updated":"2024-09-11T12:33:01Z","published":"2023-07-03T19:30:24Z","title":"Leveraging Cross-Lingual Transfer Learning in Spoken Named Entity\n Recognition Systems","summary":" Recent Named Entity Recognition (NER) advancements have significantly\nenhanced text classification capabilities. This paper focuses on spoken NER,\naimed explicitly at spoken document retrieval, an area not widely studied due\nto the lack of comprehensive datasets for spoken contexts. Additionally, the\npotential for cross-lingual transfer learning in low-resource situations\ndeserves further investigation. In our study, we applied transfer learning\ntechniques across Dutch, English, and German using both pipeline and End-to-End\n(E2E) approaches. We employed Wav2Vec2 XLS-R models on custom pseudo-annotated\ndatasets to evaluate the adaptability of cross-lingual systems. Our exploration\nof different architectural configurations assessed the robustness of these\nsystems in spoken NER. Results showed that the E2E model was superior to the\npipeline model, particularly with limited annotation resources. Furthermore,\ntransfer learning from German to Dutch improved performance by 7% over the\nstandalone Dutch E2E system and 4% over the Dutch pipeline model. Our findings\nhighlight the effectiveness of cross-lingual transfer in spoken NER and\nemphasize the need for additional data collection to improve these systems.\n","authors":["Moncef Benaicha","David Thulke","M. A. Tuğtekin Turan"],"pdf_url":"https://arxiv.org/pdf/2307.01310v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01193v2","updated":"2024-09-11T12:29:02Z","published":"2024-09-02T11:59:56Z","title":"CLIBE: Detecting Dynamic Backdoors in Transformer-based NLP Models","summary":" Backdoors can be injected into NLP models to induce misbehavior when the\ninput text contains a specific feature, known as a trigger, which the attacker\nsecretly selects. Unlike fixed words, phrases, or sentences used in the static\ntext trigger, NLP dynamic backdoor attacks design triggers associated with\nabstract and latent text features, making them considerably stealthier than\ntraditional static backdoor attacks. 
However, existing research on NLP backdoor\ndetection primarily focuses on defending against static backdoor attacks, while\ndetecting dynamic backdoors in NLP models remains largely unexplored. This\npaper presents CLIBE, the first framework to detect dynamic backdoors in\nTransformer-based NLP models. CLIBE injects a \"few-shot perturbation\" into the\nsuspect Transformer model by crafting optimized weight perturbation in the\nattention layers to make the perturbed model classify a limited number of\nreference samples as a target label. Subsequently, CLIBE leverages the\ngeneralization ability of this few-shot perturbation to determine whether the\noriginal model contains a dynamic backdoor. Extensive evaluation on three\nadvanced NLP dynamic backdoor attacks, two widely-used Transformer frameworks,\nand four real-world classification tasks strongly validates the effectiveness\nof CLIBE. We also demonstrate the robustness of CLIBE against various adaptive\nattacks. Furthermore, we employ CLIBE to scrutinize 49 popular Transformer\nmodels on Hugging Face and discover one exhibiting a high probability of\ncontaining a dynamic backdoor. We have contacted Hugging Face and provided\ndetailed evidence of this model's backdoor behavior. Moreover, we extend CLIBE\nto detect backdoor text generation models modified to exhibit toxic behavior.\nTo the best of our knowledge, CLIBE is the first framework capable of detecting\nbackdoors in text generation models without access to trigger input test\nsamples.\n","authors":["Rui Zeng","Xi Chen","Yuwen Pu","Xuhong Zhang","Tianyu Du","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2409.01193v2.pdf","comment":"To appear in the Network and Distributed System Security (NDSS)\n Symposium, February, 2025"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.01137v3","updated":"2024-09-11T22:59:34Z","published":"2024-09-02T10:19:31Z","title":"Smart E-commerce Recommendations with Semantic AI","summary":" In e-commerce, web mining for page recommendations is widely used but often\nfails to meet user needs. To address this, we propose a novel solution\ncombining semantic web mining with BP neural networks. We process user search\nlogs to extract five key features: content priority, time spent, user feedback,\nrecommendation semantics, and input deviation. These features are then fed into\na BP neural network to classify and prioritize web pages. The prioritized pages\nare recommended to users. Using book sales pages for testing, our results\ndemonstrate that this solution can quickly and accurately identify the pages\nusers need. Our approach ensures that recommendations are more relevant and\ntailored to individual preferences, enhancing the online shopping experience.\nBy leveraging advanced semantic analysis and neural network techniques, we\nbridge the gap between user expectations and actual recommendations. This\ninnovative method not only improves accuracy but also speeds up the\nrecommendation process, making it a valuable tool for e-commerce platforms\naiming to boost user satisfaction and engagement. Additionally, our system\nability to handle large datasets and provide real-time recommendations makes it\na scalable and efficient solution for modern e-commerce challenges.\n","authors":["M. Badouch","M. 
Boutaounte"],"pdf_url":"https://arxiv.org/pdf/2409.01137v3.pdf","comment":"My paper contain some errors"},{"id":"http://arxiv.org/abs/2409.07627v1","updated":"2024-09-11T21:18:21Z","published":"2024-09-11T21:18:21Z","title":"Leveraging User-Generated Reviews for Recommender Systems with Dynamic\n Headers","summary":" E-commerce platforms have a vast catalog of items to cater to their\ncustomers' shopping interests. Most of these platforms assist their customers\nin the shopping process by offering optimized recommendation carousels,\ndesigned to help customers quickly locate their desired items. Many models have\nbeen proposed in academic literature to generate and enhance the ranking and\nrecall set of items in these carousels. Conventionally, the accompanying\ncarousel title text (header) of these carousels remains static. In most\ninstances, a generic text such as \"Items similar to your current viewing\" is\nutilized. Fixed variations such as the inclusion of specific attributes \"Other\nitems from a similar seller\" or \"Items from a similar brand\" in addition to\n\"frequently bought together\" or \"considered together\" are observed as well.\nThis work proposes a novel approach to customize the header generation process\nof these carousels. Our work leverages user-generated reviews that lay focus on\nspecific attributes (aspects) of an item that were favorably perceived by users\nduring their interaction with the given item. We extract these aspects from\nreviews and train a graph neural network-based model under the framework of a\nconditional ranking task. We refer to our innovative methodology as Dynamic\nText Snippets (DTS) which generates multiple header texts for an anchor item\nand its recall set. Our approach demonstrates the potential of utilizing\nuser-generated reviews and presents a unique paradigm for exploring\nincreasingly context-aware recommendation systems.\n","authors":["Shanu Vashishtha","Abhay Kumar","Lalitesh Morishetti","Kaushiki Nag","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2409.07627v1.pdf","comment":"7 pages, 3 figures, PAIS 2024 (ECAI)"},{"id":"http://arxiv.org/abs/2409.07604v1","updated":"2024-09-11T20:31:42Z","published":"2024-09-11T20:31:42Z","title":"Multilingual Prompts in LLM-Based Recommenders: Performance Across\n Languages","summary":" Large language models (LLMs) are increasingly used in natural language\nprocessing tasks. Recommender systems traditionally use methods such as\ncollaborative filtering and matrix factorization, as well as advanced\ntechniques like deep learning and reinforcement learning. Although language\nmodels have been applied in recommendation, the recent trend have focused on\nleveraging the generative capabilities of LLMs for more personalized\nsuggestions. While current research focuses on English due to its resource\nrichness, this work explores the impact of non-English prompts on\nrecommendation performance. Using OpenP5, a platform for developing and\nevaluating LLM-based recommendations, we expanded its English prompt templates\nto include Spanish and Turkish. Evaluation on three real-world datasets, namely\nML1M, LastFM, and Amazon-Beauty, showed that usage of non-English prompts\ngenerally reduce performance, especially in less-resourced languages like\nTurkish. We also retrained an LLM-based recommender model with multilingual\nprompts to analyze performance variations. Retraining with multilingual prompts\nresulted in more balanced performance across languages, but slightly reduced\nEnglish performance. 
This work highlights the need for diverse language support\nin LLM-based recommenders and suggests future research on creating evaluation\ndatasets, using newer models and additional languages.\n","authors":["Makbule Gulcin Ozsoy"],"pdf_url":"https://arxiv.org/pdf/2409.07604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07433v1","updated":"2024-09-11T17:27:04Z","published":"2024-09-11T17:27:04Z","title":"Dot Product is All You Need: Bridging the Gap Between Item\n Recommendation and Link Prediction","summary":" Item recommendation (the task of predicting if a user may interact with new\nitems from the catalogue in a recommendation system) and link prediction (the\ntask of identifying missing links in a knowledge graph) have long been regarded\nas distinct problems. In this work, we show that the item recommendation\nproblem can be seen as an instance of the link prediction problem, where\nentities in the graph represent users and items, and the task consists of\npredicting missing instances of the relation type <>. In a\npreliminary attempt to demonstrate the assumption, we decide to test three\npopular factorisation-based link prediction models on the item recommendation\ntask, showing that their predictive accuracy is competitive with ten\nstate-of-the-art recommendation models. The purpose is to show how the former\nmay be seamlessly and effectively applied to the recommendation task without\nany specific modification to their architectures. Finally, while beginning to\nunveil the key reasons behind the recommendation performance of the selected\nlink prediction models, we explore different settings for their hyper-parameter\nvalues, paving the way for future directions.\n","authors":["Daniele Malitesta","Alberto Carlo Maria Mancino","Pasquale Minervini","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2409.07433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07416v1","updated":"2024-09-11T17:01:06Z","published":"2024-09-11T17:01:06Z","title":"Hierarchical Reinforcement Learning for Temporal Abstraction of Listwise\n Recommendation","summary":" Modern listwise recommendation systems need to consider both long-term user\nperceptions and short-term interest shifts. Reinforcement learning can be\napplied on recommendation to study such a problem but is also subject to large\nsearch space, sparse user feedback and long interactive latency. Motivated by\nrecent progress in hierarchical reinforcement learning, we propose a novel\nframework called mccHRL to provide different levels of temporal abstraction on\nlistwise recommendation. Within the hierarchical framework, the high-level\nagent studies the evolution of user perception, while the low-level agent\nproduces the item selection policy by modeling the process as a sequential\ndecision-making problem. We argue that such framework has a well-defined\ndecomposition of the outra-session context and the intra-session context, which\nare encoded by the high-level and low-level agents, respectively. To verify\nthis argument, we implement both a simulator-based environment and an\nindustrial dataset-based experiment. Results observe significant performance\nimprovement by our method, compared with several well-known baselines. 
Data and\ncodes have been made public.\n","authors":["Luo Ji","Gao Liu","Mingyang Yin","Hongxia Yang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.07416v1.pdf","comment":"18 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.07367v1","updated":"2024-09-11T15:56:05Z","published":"2024-09-11T15:56:05Z","title":"Enhancing Sequential Music Recommendation with Negative\n Feedback-informed Contrastive Learning","summary":" Modern music streaming services are heavily based on recommendation engines\nto serve content to users. Sequential recommendation -- continuously providing\nnew items within a single session in a contextually coherent manner -- has been\nan emerging topic in current literature. User feedback -- a positive or\nnegative response to the item presented -- is used to drive content\nrecommendations by learning user preferences. We extend this idea to\nsession-based recommendation to provide context-coherent music recommendations\nby modelling negative user feedback, i.e., skips, in the loss function. We\npropose a sequence-aware contrastive sub-task to structure item embeddings in\nsession-based music recommendation, such that true next-positive items\n(ignoring skipped items) are structured closer in the session embedding space,\nwhile skipped tracks are structured farther away from all items in the session.\nThis directly affects item rankings using a K-nearest-neighbors search for\nnext-item recommendations, while also promoting the rank of the true next item.\nExperiments incorporating this task into SoTA methods for sequential item\nrecommendation show consistent performance gains in terms of next-item hit\nrate, item ranking, and skip down-ranking on three music recommendation\ndatasets, strongly benefiting from the increasing presence of user feedback.\n","authors":["Pavan Seshadri","Shahrzad Shashaani","Peter Knees"],"pdf_url":"https://arxiv.org/pdf/2409.07367v1.pdf","comment":"To-appear at 18th ACM Conference on Recommendation Systems"},{"id":"http://arxiv.org/abs/2311.04916v3","updated":"2024-09-11T14:59:21Z","published":"2023-11-02T04:01:04Z","title":"Explainable Identification of Hate Speech towards Islam using Graph\n Neural Networks","summary":" Islamophobic language on online platforms fosters intolerance, making\ndetection and elimination crucial for promoting harmony. Traditional hate\nspeech detection models rely on NLP techniques like tokenization,\npart-of-speech tagging, and encoder-decoder models. However, Graph Neural\nNetworks (GNNs), with their ability to utilize relationships between data\npoints, offer more effective detection and greater explainability. In this\nwork, we represent speeches as nodes and connect them with edges based on their\ncontext and similarity to develop the graph. This study introduces a novel\nparadigm using GNNs to identify and explain hate speech towards Islam. Our\nmodel leverages GNNs to understand the context and patterns of hate speech by\nconnecting texts via pretrained NLP-generated word embeddings, achieving\nstate-of-the-art performance and enhancing detection accuracy while providing\nvaluable explanations. 
This highlights the potential of GNNs in combating\nonline hate speech and fostering a safer, more inclusive online environment.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2311.04916v3.pdf","comment":"Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (non-archival)\n (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP\n 2024 : NLP for Positive Impact Workshop (archival)"},{"id":"http://arxiv.org/abs/2403.17372v4","updated":"2024-09-11T14:56:27Z","published":"2024-03-26T04:16:57Z","title":"An Empirical Study of Training ID-Agnostic Multi-modal Sequential\n Recommenders","summary":" Sequential Recommendation (SR) aims to predict future user-item interactions\nbased on historical interactions. While many SR approaches concentrate on user\nIDs and item IDs, the human perception of the world through multi-modal\nsignals, like text and images, has inspired researchers to delve into\nconstructing SR from multi-modal information without using IDs. However, the\ncomplexity of multi-modal learning manifests in diverse feature extractors,\nfusion methods, and pre-trained models. Consequently, designing a simple and\nuniversal \\textbf{M}ulti-\\textbf{M}odal \\textbf{S}equential\n\\textbf{R}ecommendation (\\textbf{MMSR}) framework remains a formidable\nchallenge. We systematically summarize the existing multi-modal related SR\nmethods and distill the essence into four core components: visual encoder, text\nencoder, multimodal fusion module, and sequential architecture. Along these\ndimensions, we dissect the model designs, and answer the following\nsub-questions: First, we explore how to construct MMSR from scratch, ensuring\nits performance is either on par with or exceeds that of existing SR methods without\ncomplex techniques. Second, we examine if MMSR can benefit from existing\nmulti-modal pre-training paradigms. Third, we assess MMSR's capability in\ntackling common challenges like cold start and domain transferring. Our\nexperimental results across four real-world recommendation scenarios demonstrate\nthe great potential of ID-agnostic multi-modal sequential recommendation. Our\nframework can be found at: https://github.com/MMSR23/MMSR.\n","authors":["Youhua Li","Hanwen Du","Yongxin Ni","Yuanqi He","Junchen Fu","Xiangyan Liu","Qi Guo"],"pdf_url":"https://arxiv.org/pdf/2403.17372v4.pdf","comment":"We are requesting to withdraw the paper due to a significant\n methodological error discovered in the experimental setup, specifically in\n Section 4.3. This error affects the validity of the results and conclusions\n drawn from the study. 
We intend to address these issues and submit a\n corrected version in the future"},{"id":"http://arxiv.org/abs/2409.07238v1","updated":"2024-09-11T12:51:41Z","published":"2024-09-11T12:51:41Z","title":"Diff-VPS: Video Polyp Segmentation via a Multi-task Diffusion Network\n with Adversarial Temporal Reasoning","summary":" Diffusion Probabilistic Models have recently attracted significant attention\nin the community of computer vision due to their outstanding performance.\nHowever, while a substantial amount of diffusion-based research has focused on\ngenerative tasks, no work introduces diffusion models to advance the results of\npolyp segmentation in videos, which is frequently challenged by polyps' high\ncamouflage and redundant temporal cues. In this paper, we present a novel\ndiffusion-based network for the video polyp segmentation task, dubbed Diff-VPS.\nWe incorporate multi-task supervision into diffusion models to promote the\ndiscrimination of diffusion models on pixel-by-pixel segmentation. This\nintegrates the contextual high-level information achieved by the joint\nclassification and detection tasks. To explore the temporal dependency, a\nTemporal Reasoning Module (TRM) is devised via reasoning and reconstructing the\ntarget frame from the previous frames. We further equip TRM with a generative\nadversarial self-supervised strategy to produce more realistic frames and thus\ncapture better dynamic cues. Extensive experiments are conducted on SUN-SEG,\nand the results indicate that our proposed Diff-VPS significantly achieves\nstate-of-the-art performance. Code is available at\nhttps://github.com/lydia-yllu/Diff-VPS.\n","authors":["Yingling Lu","Yijun Yang","Zhaohu Xing","Qiong Wang","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.07238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07237v1","updated":"2024-09-11T12:48:52Z","published":"2024-09-11T12:48:52Z","title":"Negative Sampling in Recommendation: A Survey and Future Directions","summary":" Recommender systems aim to capture users' personalized preferences from the\nvast amount of user behaviors, making them pivotal in the era of information\nexplosion. However, the presence of the dynamic preference, the \"information\ncocoons\", and the inherent feedback loops in recommendation make users interact\nwith a limited number of items. Conventional recommendation algorithms\ntypically focus on the positive historical behaviors, while neglecting the\nessential role of negative feedback in user interest understanding. As a\npromising but easily overlooked area, negative sampling is proficient at\nrevealing the genuine negative aspect inherent in user behaviors, emerging as\nan inescapable procedure in recommendation. In this survey, we first discuss\nthe role of negative sampling in recommendation and thoroughly analyze\nchallenges that consistently impede its progress. Then, we conduct an extensive\nliterature review on the existing negative sampling strategies in\nrecommendation and classify them into five categories with their discrepant\ntechniques. 
Finally, we detail the insights of the tailored negative sampling\nstrategies in diverse recommendation scenarios and outline an overview of the\nprospective research directions toward which the community may engage and\nbenefit.\n","authors":["Haokai Ma","Ruobing Xie","Lei Meng","Fuli Feng","Xiaoyu Du","Xingwu Sun","Zhanhui Kang","Xiangxu Meng"],"pdf_url":"https://arxiv.org/pdf/2409.07237v1.pdf","comment":"38 pages, 9 figures; Under review"},{"id":"http://arxiv.org/abs/2409.05570v2","updated":"2024-09-11T07:51:10Z","published":"2024-09-09T12:53:06Z","title":"Rs4rs: Semantically Find Recent Publications from Top Recommendation\n System-Related Venues","summary":" Rs4rs is a web application designed to perform semantic search on recent\npapers from top conferences and journals related to Recommender Systems.\nCurrent scholarly search engine tools like Google Scholar, Semantic Scholar,\nand ResearchGate often yield broad results that fail to target the most\nrelevant high-quality publications. Moreover, manually visiting individual\nconference and journal websites is a time-consuming process that primarily\nsupports only syntactic searches. Rs4rs addresses these issues by providing a\nuser-friendly platform where researchers can input their topic of interest and\nreceive a list of recent, relevant papers from top Recommender Systems venues.\nUtilizing semantic search techniques, Rs4rs ensures that the search results are\nnot only precise and relevant but also comprehensive, capturing papers\nregardless of variations in wording. This tool significantly enhances research\nefficiency and accuracy, thereby benefitting the research community and public\nby facilitating access to high-quality, pertinent academic resources in the\nfield of Recommender Systems. Rs4rs is available at https://rs4rs.com.\n","authors":["Tri Kurniawan Wijaya","Edoardo D'Amico","Gabor Fodor","Manuel V. Loureiro"],"pdf_url":"https://arxiv.org/pdf/2409.05570v2.pdf","comment":"Accepted in ACM RecSys 2024"},{"id":"http://arxiv.org/abs/2405.02219v2","updated":"2024-09-11T07:27:51Z","published":"2024-05-03T16:25:27Z","title":"A Normative Framework for Benchmarking Consumer Fairness in Large\n Language Model Recommender System","summary":" The rapid adoption of large language models (LLMs) in recommender systems\n(RS) presents new challenges in understanding and evaluating their biases,\nwhich can result in unfairness or the amplification of stereotypes. Traditional\nfairness evaluations in RS primarily focus on collaborative filtering (CF)\nsettings, which may not fully capture the complexities of LLMs, as these models\noften inherit biases from large, unregulated data. This paper proposes a\nnormative framework to benchmark consumer fairness in LLM-powered recommender\nsystems (RecLLMs).\n We critically examine how fairness norms in classical RS fall short in\naddressing the challenges posed by LLMs. We argue that this gap can lead to\narbitrary conclusions about fairness, and we propose a more structured, formal\napproach to evaluate fairness in such systems. Our experiments on the MovieLens\ndataset on consumer fairness, using in-context learning (zero-shot vs.\nfew-shot) reveal fairness deviations in age-based recommendations, particularly\nwhen additional contextual examples are introduced (ICL-2). Statistical\nsignificance tests confirm that these deviations are not random, highlighting\nthe need for robust evaluation methods. 
While this work offers a preliminary\ndiscussion on a proposed normative framework, our hope is that it could provide\na formal, principled approach for auditing and mitigating bias in RecLLMs. The\ncode and dataset used for this work will be shared at \"gihub-anonymized\".\n","authors":["Yashar Deldjoo","Fatemeh Nazary"],"pdf_url":"https://arxiv.org/pdf/2405.02219v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07033v1","updated":"2024-09-11T06:03:02Z","published":"2024-09-11T06:03:02Z","title":"E-commerce Webpage Recommendation Scheme Base on Semantic Mining and\n Neural Networks","summary":" In e-commerce websites, web mining web page recommendation technology has\nbeen widely used. However, recommendation solutions often cannot meet the\nactual application needs of online shopping users. To address this problem,\nthis paper proposes an e-commerce web page recommendation solution that\ncombines semantic web mining and BP neural networks. First, the web logs of\nuser searches are processed, and 5 features are extracted: content priority,\ntime consumption priority, online shopping users' explicit/implicit feedback on\nthe website, recommendation semantics and input deviation amount. Then, these\nfeatures are used as input features of the BP neural network to classify and\nidentify the priority of the final output web page. Finally, the web pages are\nsorted according to priority and recommended to users. This project uses book\nsales webpages as samples for experiments. The results show that this solution\ncan quickly and accurately identify the webpages required by users.\n","authors":["Wenchao Zhao","Xiaoyi Liu","Ruilin Xu","Lingxi Xiao","Muqing Li"],"pdf_url":"https://arxiv.org/pdf/2409.07033v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2409.01137"},{"id":"http://arxiv.org/abs/2409.05878v2","updated":"2024-09-11T04:47:52Z","published":"2024-08-25T12:12:08Z","title":"CF-KAN: Kolmogorov-Arnold Network-based Collaborative Filtering to\n Mitigate Catastrophic Forgetting in Recommender Systems","summary":" Collaborative filtering (CF) remains essential in recommender systems,\nleveraging user--item interactions to provide personalized recommendations.\nMeanwhile, a number of CF techniques have evolved into sophisticated model\narchitectures based on multi-layer perceptrons (MLPs). However, MLPs often\nsuffer from catastrophic forgetting, and thus lose previously acquired\nknowledge when new information is learned, particularly in dynamic environments\nrequiring continual learning. To tackle this problem, we propose CF-KAN, a new\nCF method utilizing Kolmogorov-Arnold networks (KANs). By learning nonlinear\nfunctions on the edge level, KANs are more robust to the catastrophic\nforgetting problem than MLPs. 
Built upon a KAN-based autoencoder, CF-KAN is\ndesigned in the sense of effectively capturing the intricacies of sparse\nuser--item interactions and retaining information from previous data instances.\nDespite its simplicity, our extensive experiments demonstrate 1) CF-KAN's\nsuperiority over state-of-the-art methods in recommendation accuracy, 2)\nCF-KAN's resilience to catastrophic forgetting, underscoring its effectiveness\nin both static and dynamic recommendation scenarios, and 3) CF-KAN's edge-level\ninterpretation facilitating the explainability of recommendations.\n","authors":["Jin-Duk Park","Kyung-Min Kim","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2409.05878v2.pdf","comment":"9 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.05405v2","updated":"2024-09-11T02:44:52Z","published":"2024-09-09T08:06:50Z","title":"A Survey of Multimodal Composite Editing and Retrieval","summary":" In the real world, where information is abundant and diverse across different\nmodalities, understanding and utilizing various data types to improve retrieval\nsystems is a key focus of research. Multimodal composite retrieval integrates\ndiverse modalities such as text, image and audio, etc. to provide more\naccurate, personalized, and contextually relevant results. To facilitate a\ndeeper understanding of this promising direction, this survey explores\nmultimodal composite editing and retrieval in depth, covering image-text\ncomposite editing, image-text composite retrieval, and other multimodal\ncomposite retrieval. In this survey, we systematically organize the application\nscenarios, methods, benchmarks, experiments, and future directions. Multimodal\nlearning is a hot topic in large model era, and have also witnessed some\nsurveys in multimodal learning and vision-language models with transformers\npublished in the PAMI journal. To the best of our knowledge, this survey is the\nfirst comprehensive review of the literature on multimodal composite retrieval,\nwhich is a timely complement of multimodal fusion to existing reviews. To help\nreaders' quickly track this field, we build the project page for this survey,\nwhich can be found at\nhttps://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.\n","authors":["Suyan Li","Fuxiang Huang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05405v2.pdf","comment":"20 pages, 3 figures, and 11 tables"},{"id":"http://arxiv.org/abs/2405.17795v3","updated":"2024-09-11T02:07:20Z","published":"2024-05-28T03:45:34Z","title":"Dataset Regeneration for Sequential Recommendation","summary":" The sequential recommender (SR) system is a crucial component of modern\nrecommender systems, as it aims to capture the evolving preferences of users.\nSignificant efforts have been made to enhance the capabilities of SR systems.\nThese methods typically follow the model-centric paradigm, which involves\ndeveloping effective models based on fixed datasets. However, this approach\noften overlooks potential quality issues and flaws inherent in the data. Driven\nby the potential of data-centric AI, we propose a novel data-centric paradigm\nfor developing an ideal training dataset using a model-agnostic dataset\nregeneration framework called DR4SR. This framework enables the regeneration of\na dataset with exceptional cross-architecture generalizability. 
Additionally,\nwe introduce the DR4SR+ framework, which incorporates a model-aware dataset\npersonalizer to tailor the regenerated dataset specifically for a target model.\nTo demonstrate the effectiveness of the data-centric paradigm, we integrate our\nframework with various model-centric methods and observe significant\nperformance improvements across four widely adopted datasets. Furthermore, we\nconduct in-depth analyses to explore the potential of the data-centric paradigm\nand provide valuable insights. The code can be found at\nhttps://github.com/USTC-StarTeam/DR4SR.\n","authors":["Mingjia Yin","Hao Wang","Wei Guo","Yong Liu","Suojuan Zhang","Sirui Zhao","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.17795v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.07454v1","updated":"2024-09-11T17:59:02Z","published":"2024-09-11T17:59:02Z","title":"DreamMesh: Jointly Manipulating and Texturing Triangle Meshes for\n Text-to-3D Generation","summary":" Learning radiance fields (NeRF) with powerful 2D diffusion models has\ngarnered popularity for text-to-3D generation. Nevertheless, the implicit 3D\nrepresentations of NeRF lack explicit modeling of meshes and textures over\nsurfaces, and such surface-undefined way may suffer from the issues, e.g.,\nnoisy surfaces with ambiguous texture details or cross-view inconsistency. To\nalleviate this, we present DreamMesh, a novel text-to-3D architecture that\npivots on well-defined surfaces (triangle meshes) to generate high-fidelity\nexplicit 3D model. Technically, DreamMesh capitalizes on a distinctive\ncoarse-to-fine scheme. In the coarse stage, the mesh is first deformed by\ntext-guided Jacobians and then DreamMesh textures the mesh with an interlaced\nuse of 2D diffusion models in a tuning free manner from multiple viewpoints. In\nthe fine stage, DreamMesh jointly manipulates the mesh and refines the texture\nmap, leading to high-quality triangle meshes with high-fidelity textured\nmaterials. Extensive experiments demonstrate that DreamMesh significantly\noutperforms state-of-the-art text-to-3D methods in faithfully generating 3D\ncontent with richer textual details and enhanced geometry. Our project page is\navailable at https://dreammesh.github.io.\n","authors":["Haibo Yang","Yang Chen","Yingwei Pan","Ting Yao","Zhineng Chen","Zuxuan Wu","Yu-Gang Jiang","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.07454v1.pdf","comment":"ECCV 2024. Project page is available at\n \\url{https://dreammesh.github.io}"},{"id":"http://arxiv.org/abs/2409.07452v1","updated":"2024-09-11T17:58:57Z","published":"2024-09-11T17:58:57Z","title":"Hi3D: Pursuing High-Resolution Image-to-3D Generation with Video\n Diffusion Models","summary":" Despite having tremendous progress in image-to-3D generation, existing\nmethods still struggle to produce multi-view consistent images with\nhigh-resolution textures in detail, especially in the paradigm of 2D diffusion\nthat lacks 3D awareness. In this work, we present High-resolution Image-to-3D\nmodel (Hi3D), a new video diffusion based paradigm that redefines a single\nimage to multi-view images as 3D-aware sequential image generation (i.e.,\norbital video generation). This methodology delves into the underlying temporal\nconsistency knowledge in video diffusion model that generalizes well to\ngeometry consistency across multiple views in 3D generation. 
Technically, Hi3D\nfirst empowers the pre-trained video diffusion model with 3D-aware prior\n(camera pose condition), yielding multi-view images with low-resolution texture\ndetails. A 3D-aware video-to-video refiner is learnt to further scale up the\nmulti-view images with high-resolution texture details. Such high-resolution\nmulti-view images are further augmented with novel views through 3D Gaussian\nSplatting, which are finally leveraged to obtain high-fidelity meshes via 3D\nreconstruction. Extensive experiments on both novel view synthesis and single\nview reconstruction demonstrate that our Hi3D manages to produce superior\nmulti-view consistency images with highly-detailed textures. Source code and\ndata are available at \\url{https://github.com/yanghb22-fdu/Hi3D-Official}.\n","authors":["Haibo Yang","Yang Chen","Yingwei Pan","Ting Yao","Zhineng Chen","Chong-Wah Ngo","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.07452v1.pdf","comment":"ACM Multimedia 2024. Source code is available at\n \\url{https://github.com/yanghb22-fdu/Hi3D-Official}"},{"id":"http://arxiv.org/abs/2409.07451v1","updated":"2024-09-11T17:58:50Z","published":"2024-09-11T17:58:50Z","title":"FreeEnhance: Tuning-Free Image Enhancement via Content-Consistent\n Noising-and-Denoising Process","summary":" The emergence of text-to-image generation models has led to the recognition\nthat image enhancement, performed as post-processing, would significantly\nimprove the visual quality of the generated images. Exploring diffusion models\nto enhance the generated images nevertheless is not trivial and necessitates to\ndelicately enrich plentiful details while preserving the visual appearance of\nkey content in the original image. In this paper, we propose a novel framework,\nnamely FreeEnhance, for content-consistent image enhancement using the\noff-the-shelf image diffusion models. Technically, FreeEnhance is a two-stage\nprocess that firstly adds random noise to the input image and then capitalizes\non a pre-trained image diffusion model (i.e., Latent Diffusion Models) to\ndenoise and enhance the image details. In the noising stage, FreeEnhance is\ndevised to add lighter noise to the region with higher frequency to preserve\nthe high-frequent patterns (e.g., edge, corner) in the original image. In the\ndenoising stage, we present three target properties as constraints to\nregularize the predicted noise, enhancing images with high acutance and high\nvisual quality. Extensive experiments conducted on the HPDv2 dataset\ndemonstrate that our FreeEnhance outperforms the state-of-the-art image\nenhancement models in terms of quantitative metrics and human preference. More\nremarkably, FreeEnhance also shows higher human preference compared to the\ncommercial image enhancement solution of Magnific AI.\n","authors":["Yang Luo","Yiheng Zhang","Zhaofan Qiu","Ting Yao","Zhineng Chen","Yu-Gang Jiang","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2409.07451v1.pdf","comment":"ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2409.07450v1","updated":"2024-09-11T17:56:48Z","published":"2024-09-11T17:56:48Z","title":"VMAS: Video-to-Music Generation via Semantic Alignment in Web Music\n Videos","summary":" We present a framework for learning to generate background music from video\ninputs. Unlike existing works that rely on symbolic musical annotations, which\nare limited in quantity and diversity, our method leverages large-scale web\nvideos accompanied by background music. 
This enables our model to learn to\ngenerate realistic and diverse music. To accomplish this goal, we develop a\ngenerative video-music Transformer with a novel semantic video-music alignment\nscheme. Our model uses a joint autoregressive and contrastive learning\nobjective, which encourages the generation of music aligned with high-level\nvideo content. We also introduce a novel video-beat alignment scheme to match\nthe generated music beats with the low-level motions in the video. Lastly, to\ncapture fine-grained visual cues in a video needed for realistic background\nmusic generation, we introduce a new temporal video encoder architecture,\nallowing us to efficiently process videos consisting of many densely sampled\nframes. We train our framework on our newly curated DISCO-MV dataset,\nconsisting of 2.2M video-music samples, which is orders of magnitude larger\nthan any prior datasets used for video music generation. Our method outperforms\nexisting approaches on the DISCO-MV and MusicCaps datasets according to various\nmusic generation evaluation metrics, including human evaluation. Results are\navailable at https://genjib.github.io/project_page/VMAs/index.html\n","authors":["Yan-Bo Lin","Yu Tian","Linjie Yang","Gedas Bertasius","Heng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.07450v1.pdf","comment":"Project Page: https://genjib.github.io/project_page/VMAs/index.html"},{"id":"http://arxiv.org/abs/2409.05606v2","updated":"2024-09-11T12:06:57Z","published":"2024-09-09T13:39:47Z","title":"CustomContrast: A Multilevel Contrastive Perspective For Subject-Driven\n Text-to-Image Customization","summary":" Subject-driven text-to-image (T2I) customization has drawn significant\ninterest in academia and industry. This task enables pre-trained models to\ngenerate novel images based on unique subjects. Existing studies adopt a\nself-reconstructive perspective, focusing on capturing all details of a single\nimage, which will misconstrue the specific image's irrelevant attributes (e.g.,\nview, pose, and background) as the subject intrinsic attributes. This\nmisconstruction leads to both overfitting or underfitting of irrelevant and\nintrinsic attributes of the subject, i.e., these attributes are\nover-represented or under-represented simultaneously, causing a trade-off\nbetween similarity and controllability. In this study, we argue an ideal\nsubject representation can be achieved by a cross-differential perspective,\ni.e., decoupling subject intrinsic attributes from irrelevant attributes via\ncontrastive learning, which allows the model to focus more on intrinsic\nattributes through intra-consistency (features of the same subject are\nspatially closer) and inter-distinctiveness (features of different subjects\nhave distinguished differences). Specifically, we propose CustomContrast, a\nnovel framework, which includes a Multilevel Contrastive Learning (MCL)\nparadigm and a Multimodal Feature Injection (MFI) Encoder. The MCL paradigm is\nused to extract intrinsic features of subjects from high-level semantics to\nlow-level appearance through crossmodal semantic contrastive learning and\nmultiscale appearance contrastive learning. To facilitate contrastive learning,\nwe introduce the MFI encoder to capture cross-modal representations. 
Extensive\nexperiments show the effectiveness of CustomContrast in subject similarity and\ntext controllability.\n","authors":["Nan Chen","Mengqi Huang","Zhuowei Chen","Yang Zheng","Lei Zhang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2409.05606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13621v6","updated":"2024-09-11T06:13:30Z","published":"2024-04-21T11:21:27Z","title":"Attack on Scene Flow using Point Clouds","summary":" Deep neural networks have made significant advancements in accurately\nestimating scene flow using point clouds, which is vital for many applications\nlike video analysis, action recognition, and navigation. The robustness of\nthese techniques, however, remains a concern, particularly in the face of\nadversarial attacks that have been proven to deceive state-of-the-art deep\nneural networks in many domains. Surprisingly, the robustness of scene flow\nnetworks against such attacks has not been thoroughly investigated. To address\nthis problem, the proposed approach aims to bridge this gap by introducing\nadversarial white-box attacks specifically tailored for scene flow networks.\nExperimental results show that the generated adversarial examples obtain up to\n33.7 relative degradation in average end-point error on the KITTI and\nFlyingThings3D datasets. The study also reveals the significant impact that\nattacks targeting point clouds in only one dimension or color channel have on\naverage end-point error. Analyzing the success and failure of these attacks on\nthe scene flow networks and their 2D optical flow network variants shows a\nhigher vulnerability for the optical flow networks. Code is available at\nhttps://github.com/aheldis/Attack-on-Scene-Flow-using-Point-Clouds.git.\n","authors":["Haniyeh Ehsani Oskouie","Mohammad-Shahram Moin","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2404.13621v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15757v2","updated":"2024-09-11T03:27:53Z","published":"2024-05-24T17:53:06Z","title":"Looking Backward: Streaming Video-to-Video Translation with Feature\n Banks","summary":" This paper introduces StreamV2V, a diffusion model that achieves real-time\nstreaming video-to-video (V2V) translation with user prompts. Unlike prior V2V\nmethods using batches to process limited frames, we opt to process frames in a\nstreaming fashion, to support unlimited frames. At the heart of StreamV2V lies\na backward-looking principle that relates the present to the past. This is\nrealized by maintaining a feature bank, which archives information from past\nframes. For incoming frames, StreamV2V extends self-attention to include banked\nkeys and values and directly fuses similar past features into the output. The\nfeature bank is continually updated by merging stored and new features, making\nit compact but informative. StreamV2V stands out for its adaptability and\nefficiency, seamlessly integrating with image diffusion models without\nfine-tuning. It can run 20 FPS on one A100 GPU, being 15x, 46x, 108x, and 158x\nfaster than FlowVid, CoDeF, Rerender, and TokenFlow, respectively. 
Quantitative\nmetrics and user studies confirm StreamV2V's exceptional ability to maintain\ntemporal consistency.\n","authors":["Feng Liang","Akio Kodaira","Chenfeng Xu","Masayoshi Tomizuka","Kurt Keutzer","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2405.15757v2.pdf","comment":"Project page: https://jeff-liangf.github.io/projects/streamv2v"},{"id":"http://arxiv.org/abs/2409.05405v2","updated":"2024-09-11T02:44:52Z","published":"2024-09-09T08:06:50Z","title":"A Survey of Multimodal Composite Editing and Retrieval","summary":" In the real world, where information is abundant and diverse across different\nmodalities, understanding and utilizing various data types to improve retrieval\nsystems is a key focus of research. Multimodal composite retrieval integrates\ndiverse modalities such as text, image and audio, etc. to provide more\naccurate, personalized, and contextually relevant results. To facilitate a\ndeeper understanding of this promising direction, this survey explores\nmultimodal composite editing and retrieval in depth, covering image-text\ncomposite editing, image-text composite retrieval, and other multimodal\ncomposite retrieval. In this survey, we systematically organize the application\nscenarios, methods, benchmarks, experiments, and future directions. Multimodal\nlearning is a hot topic in large model era, and have also witnessed some\nsurveys in multimodal learning and vision-language models with transformers\npublished in the PAMI journal. To the best of our knowledge, this survey is the\nfirst comprehensive review of the literature on multimodal composite retrieval,\nwhich is a timely complement of multimodal fusion to existing reviews. To help\nreaders' quickly track this field, we build the project page for this survey,\nwhich can be found at\nhttps://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.\n","authors":["Suyan Li","Fuxiang Huang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05405v2.pdf","comment":"20 pages, 3 figures, and 11 tables"}]},"2024-09-10T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.06916v1","updated":"2024-09-10T23:58:27Z","published":"2024-09-10T23:58:27Z","title":"Interactive Counterfactual Exploration of Algorithmic Harms in\n Recommender Systems","summary":" Recommender systems have become integral to digital experiences, shaping user\ninteractions and preferences across various platforms. Despite their widespread\nuse, these systems often suffer from algorithmic biases that can lead to unfair\nand unsatisfactory user experiences. This study introduces an interactive tool\ndesigned to help users comprehend and explore the impacts of algorithmic harms\nin recommender systems. By leveraging visualizations, counterfactual\nexplanations, and interactive modules, the tool allows users to investigate how\nbiases such as miscalibration, stereotypes, and filter bubbles affect their\nrecommendations. 
Informed by in-depth user interviews, this tool benefits both\ngeneral users and researchers by increasing transparency and offering\npersonalized impact assessments, ultimately fostering a better understanding of\nalgorithmic biases and contributing to more equitable recommendation outcomes.\nThis work provides valuable insights for future research and practical\napplications in mitigating bias and enhancing fairness in machine learning\nalgorithms.\n","authors":["Yongsu Ahn","Quinn K Wolter","Jonilyn Dick","Janet Dick","Yu-Ru Lin"],"pdf_url":"https://arxiv.org/pdf/2409.06916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06793v1","updated":"2024-09-10T18:02:51Z","published":"2024-09-10T18:02:51Z","title":"Adversarial Attacks to Multi-Modal Models","summary":" Multi-modal models have gained significant attention due to their powerful\ncapabilities. These models effectively align embeddings across diverse data\nmodalities, showcasing superior performance in downstream tasks compared to\ntheir unimodal counterparts. Recent study showed that the attacker can\nmanipulate an image or audio file by altering it in such a way that its\nembedding matches that of an attacker-chosen targeted input, thereby deceiving\ndownstream models. However, this method often underperforms due to inherent\ndisparities in data from different modalities. In this paper, we introduce\nCrossFire, an innovative approach to attack multi-modal models. CrossFire\nbegins by transforming the targeted input chosen by the attacker into a format\nthat matches the modality of the original image or audio file. We then\nformulate our attack as an optimization problem, aiming to minimize the angular\ndeviation between the embeddings of the transformed input and the modified\nimage or audio file. Solving this problem determines the perturbations to be\nadded to the original media. Our extensive experiments on six real-world\nbenchmark datasets reveal that CrossFire can significantly manipulate\ndownstream tasks, surpassing existing attacks. Additionally, we evaluate six\ndefensive strategies against CrossFire, finding that current defenses are\ninsufficient to counteract our CrossFire.\n","authors":["Zhihao Dou","Xin Hu","Haibo Yang","Zhuqing Liu","Minghong Fang"],"pdf_url":"https://arxiv.org/pdf/2409.06793v1.pdf","comment":"To appear in the ACM Workshop on Large AI Systems and Models with\n Privacy and Safety Analysis 2024 (LAMPS '24)"},{"id":"http://arxiv.org/abs/2409.04667v2","updated":"2024-09-10T17:56:52Z","published":"2024-09-07T00:46:58Z","title":"QueryBuilder: Human-in-the-Loop Query Development for Information\n Retrieval","summary":" Frequently, users of an Information Retrieval (IR) system start with an\noverarching information need (a.k.a., an analytic task) and proceed to define\nfiner-grained queries covering various important aspects (i.e., sub-topics) of\nthat analytic task. We present a novel, interactive system called\n$\\textit{QueryBuilder}$, which allows a novice, English-speaking user to create\nqueries with a small amount of effort, through efficient exploration of an\nEnglish development corpus in order to rapidly develop cross-lingual\ninformation retrieval queries corresponding to the user's information needs.\nQueryBuilder performs near real-time retrieval of documents based on\nuser-entered search terms; the user looks through the retrieved documents and\nmarks sentences as relevant to the information needed. 
The marked sentences are\nused by the system as additional information in query formation and refinement:\nquery terms (and, optionally, event features, which capture event $'triggers'$\n(indicator terms) and agent/patient roles) are appropriately weighted, and a\nneural-based system, which better captures textual meaning, retrieves other\nrelevant content. The process of retrieval and marking is repeated as many\ntimes as desired, giving rise to increasingly refined queries in each\niteration. The final product is a fine-grained query used in Cross-Lingual\nInformation Retrieval (CLIR). Our experiments using analytic tasks and requests\nfrom the IARPA BETTER IR datasets show that with a small amount of effort (at\nmost 10 minutes per sub-topic), novice users can form $\\textit{useful}$\nfine-grained queries including in languages they don't understand. QueryBuilder\nalso provides beneficial capabilities to the traditional corpus exploration and\nquery formation process. A demonstration video is released at\nhttps://vimeo.com/734795835\n","authors":["Hemanth Kandula","Damianos Karakos","Haoling Qiu","Benjamin Rozonoyer","Ian Soboroff","Lee Tarlin","Bonan Min"],"pdf_url":"https://arxiv.org/pdf/2409.04667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06638v1","updated":"2024-09-10T16:48:05Z","published":"2024-09-10T16:48:05Z","title":"Critical Features Tracking on Triangulated Irregular Networks by a\n Scale-Space Method","summary":" The scale-space method is a well-established framework that constructs a\nhierarchical representation of an input signal and facilitates coarse-to-fine\nvisual reasoning. Considering the terrain elevation function as the input\nsignal, the scale-space method can identify and track significant topographic\nfeatures across different scales. The number of scales a feature persists,\ncalled its life span, indicates the importance of that feature. In this way,\nimportant topographic features of a landscape can be selected, which are useful\nfor many applications, including cartography, nautical charting, and land-use\nplanning. The scale-space methods developed for terrain data use gridded\nDigital Elevation Models (DEMs) to represent the terrain. However, gridded DEMs\nlack the flexibility to adapt to the irregular distribution of input data and\nthe varied topological complexity of different regions. Instead, Triangulated\nIrregular Networks (TINs) can be directly generated from irregularly\ndistributed point clouds and accurately preserve important features. In this\nwork, we introduce a novel scale-space analysis pipeline for TINs, addressing\nthe multiple challenges in extending grid-based scale-space methods to TINs.\nOur pipeline can efficiently identify and track topologically important\nfeatures on TINs. Moreover, it is capable of analyzing terrains with irregular\nboundaries, which poses challenges for grid-based methods. 
Comprehensive\nexperiments show that, compared to grid-based methods, our TIN-based pipeline\nis more efficient, accurate, and has better resolution robustness.\n","authors":["Haoan Feng","Yunting Song","Leila De Floriani"],"pdf_url":"https://arxiv.org/pdf/2409.06638v1.pdf","comment":"13pages, ACM SIGSPATIAL 2024"},{"id":"http://arxiv.org/abs/2409.05526v2","updated":"2024-09-10T16:46:10Z","published":"2024-09-09T11:35:35Z","title":"RBoard: A Unified Platform for Reproducible and Reusable Recommender\n System Benchmarks","summary":" Recommender systems research lacks standardized benchmarks for\nreproducibility and algorithm comparisons. We introduce RBoard, a novel\nframework addressing these challenges by providing a comprehensive platform for\nbenchmarking diverse recommendation tasks, including CTR prediction, Top-N\nrecommendation, and others. RBoard's primary objective is to enable fully\nreproducible and reusable experiments across these scenarios. The framework\nevaluates algorithms across multiple datasets within each task, aggregating\nresults for a holistic performance assessment. It implements standardized\nevaluation protocols, ensuring consistency and comparability. To facilitate\nreproducibility, all user-provided code can be easily downloaded and executed,\nallowing researchers to reliably replicate studies and build upon previous\nwork. By offering a unified platform for rigorous, reproducible evaluation\nacross various recommendation scenarios, RBoard aims to accelerate progress in\nthe field and establish a new standard for recommender systems benchmarking in\nboth academia and industry. The platform is available at https://rboard.org and\nthe demo video can be found at https://bit.ly/rboard-demo.\n","authors":["Xinyang Shao","Edoardo D'Amico","Gabor Fodor","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2409.05526v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08921v2","updated":"2024-09-10T15:38:56Z","published":"2024-08-15T12:20:24Z","title":"Graph Retrieval-Augmented Generation: A Survey","summary":" Recently, Retrieval-Augmented Generation (RAG) has achieved remarkable\nsuccess in addressing the challenges of Large Language Models (LLMs) without\nnecessitating retraining. By referencing an external knowledge base, RAG\nrefines LLM outputs, effectively mitigating issues such as ``hallucination'',\nlack of domain-specific knowledge, and outdated information. However, the\ncomplex structure of relationships among different entities in databases\npresents challenges for RAG systems. In response, GraphRAG leverages structural\ninformation across entities to enable more precise and comprehensive retrieval,\ncapturing relational knowledge and facilitating more accurate, context-aware\nresponses. Given the novelty and potential of GraphRAG, a systematic review of\ncurrent technologies is imperative. This paper provides the first comprehensive\noverview of GraphRAG methodologies. We formalize the GraphRAG workflow,\nencompassing Graph-Based Indexing, Graph-Guided Retrieval, and Graph-Enhanced\nGeneration. We then outline the core technologies and training methods at each\nstage. Additionally, we examine downstream tasks, application domains,\nevaluation methodologies, and industrial use cases of GraphRAG. Finally, we\nexplore future research directions to inspire further inquiries and advance\nprogress in the field. 
In order to track recent progress in this field, we set\nup a repository at \\url{https://github.com/pengboci/GraphRAG-Survey}.\n","authors":["Boci Peng","Yun Zhu","Yongchao Liu","Xiaohe Bo","Haizhou Shi","Chuntao Hong","Yan Zhang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2408.08921v2.pdf","comment":"Ongoing work. Compared to the first version, several references have\n been added and a GitHub repository link has been provided"},{"id":"http://arxiv.org/abs/2409.07500v1","updated":"2024-09-10T15:24:13Z","published":"2024-09-10T15:24:13Z","title":"DV-FSR: A Dual-View Target Attack Framework for Federated Sequential\n Recommendation","summary":" Federated recommendation (FedRec) preserves user privacy by enabling\ndecentralized training of personalized models, but this architecture is\ninherently vulnerable to adversarial attacks. Significant research has been\nconducted on targeted attacks in FedRec systems, motivated by commercial and\nsocial influence considerations. However, much of this work has largely\noverlooked the differential robustness of recommendation models. Moreover, our\nempirical findings indicate that existing targeted attack methods achieve only\nlimited effectiveness in Federated Sequential Recommendation (FSR) tasks.\nDriven by these observations, we focus on investigating targeted attacks in FSR\nand propose a novel dualview attack framework, named DV-FSR. This attack method\nuniquely combines a sampling-based explicit strategy with a contrastive\nlearning-based implicit gradient strategy to orchestrate a coordinated attack.\nAdditionally, we introduce a specific defense mechanism tailored for targeted\nattacks in FSR, aiming to evaluate the mitigation effects of the attack method\nwe proposed. Extensive experiments validate the effectiveness of our proposed\napproach on representative sequential models.\n","authors":["Qitao Qin","Yucong Luo","Mingyue Cheng","Qingyang Mao","Chenyi Lei"],"pdf_url":"https://arxiv.org/pdf/2409.07500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04614v3","updated":"2024-09-10T14:37:00Z","published":"2024-05-07T18:58:32Z","title":"Multi-Margin Cosine Loss: Proposal and Application in Recommender\n Systems","summary":" Recommender systems guide users through vast amounts of information by\nsuggesting items based on their predicted preferences. Collaborative\nfiltering-based deep learning techniques have regained popularity due to their\nstraightforward nature, relying only on user-item interactions. Typically,\nthese systems consist of three main components: an interaction module, a loss\nfunction, and a negative sampling strategy. Initially, researchers focused on\nenhancing performance by developing complex interaction modules. However, there\nhas been a recent shift toward refining loss functions and negative sampling\nstrategies. This shift has led to an increased interest in contrastive\nlearning, which pulls similar pairs closer while pushing dissimilar ones apart.\nContrastive learning may bring challenges like high memory demands and\nunder-utilization of some negative samples. The proposed Multi-Margin Cosine\nLoss (MMCL) addresses these challenges by introducing multiple margins and\nvarying weights for negative samples. It efficiently utilizes not only the\nhardest negatives but also other non-trivial negatives, offers a simpler yet\neffective loss function that outperforms more complex methods, especially when\nresources are limited. 
Experiments on two well-known datasets demonstrated that\nMMCL achieved up to a 20\\% performance improvement compared to a baseline loss\nfunction when fewer number of negative samples are used.\n","authors":["Makbule Gulcin Ozsoy"],"pdf_url":"https://arxiv.org/pdf/2405.04614v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07606v2","updated":"2024-09-10T13:39:28Z","published":"2023-09-14T11:13:36Z","title":"Zero-shot Audio Topic Reranking using Large Language Models","summary":" Multimodal Video Search by Examples (MVSE) investigates using video clips as\nthe query term for information retrieval, rather than the more traditional text\nquery. This enables far richer search modalities such as images, speaker,\ncontent, topic, and emotion. A key element for this process is highly rapid and\nflexible search to support large archives, which in MVSE is facilitated by\nrepresenting video attributes with embeddings. This work aims to compensate for\nany performance loss from this rapid archive search by examining reranking\napproaches. In particular, zero-shot reranking methods using large language\nmodels (LLMs) are investigated as these are applicable to any video archive\naudio content. Performance is evaluated for topic-based retrieval on a publicly\navailable video archive, the BBC Rewind corpus. Results demonstrate that\nreranking significantly improves retrieval ranking without requiring any\ntask-specific in-domain training data. Furthermore, three sources of\ninformation (ASR transcriptions, automatic summaries and synopses) as input for\nLLM reranking were compared. To gain a deeper understanding and further\ninsights into the performance differences and limitations of these text\nsources, we employ a fact-checking approach to analyse the information\nconsistency among them.\n","authors":["Mengjie Qian","Rao Ma","Adian Liusie","Erfan Loweimi","Kate M. Knill","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2309.07606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06464v1","updated":"2024-09-10T12:46:23Z","published":"2024-09-10T12:46:23Z","title":"Operational Advice for Dense and Sparse Retrievers: HNSW, Flat, or\n Inverted Indexes?","summary":" Practitioners working on dense retrieval today face a bewildering number of\nchoices. Beyond selecting the embedding model, another consequential choice is\nthe actual implementation of nearest-neighbor vector search. While best\npractices recommend HNSW indexes, flat vector indexes with brute-force search\nrepresent another viable option, particularly for smaller corpora and for rapid\nprototyping. In this paper, we provide experimental results on the BEIR dataset\nusing the open-source Lucene search library that explicate the tradeoffs\nbetween HNSW and flat indexes (including quantized variants) from the\nperspectives of indexing time, query evaluation performance, and retrieval\nquality. With additional comparisons between dense and sparse retrievers, our\nresults provide guidance for today's search practitioner in understanding the\ndesign space of dense and sparse retrievers. 
To our knowledge, we are the first\nto provide operational advice supported by empirical experiments in this\nregard.\n","authors":["Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2409.06464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00860v2","updated":"2024-09-10T10:52:30Z","published":"2024-09-01T22:33:29Z","title":"A Counterfactual Explanation Framework for Retrieval Models","summary":" Explainability has become a crucial concern in today's world, aiming to\nenhance transparency in machine learning and deep learning models. Information\nretrieval is no exception to this trend. In existing literature on\nexplainability of information retrieval, the emphasis has predominantly been on\nillustrating the concept of relevance concerning a retrieval model. The\nquestions addressed include why a document is relevant to a query, why one\ndocument exhibits higher relevance than another, or why a specific set of\ndocuments is deemed relevant for a query.\n However, limited attention has been given to understanding why a particular\ndocument is considered non-relevant to a query with respect to a retrieval\nmodel. In an effort to address this gap, our work focus on the question of what\nterms need to be added within a document to improve its ranking. This in turn\nanswers the question of which words played a role in not being favored by a\nretrieval model for a particular query. We use an optimization framework to\nsolve the above-mentioned research problem. % To the best of our knowledge, we\nmark the first attempt to tackle this specific counterfactual problem. Our\nexperiments show the effectiveness of our proposed approach in predicting\ncounterfactuals for both statistical (e.g. BM25) and deep-learning-based models\n(e.g. DRMM, DSSM, ColBERT).\n","authors":["Bhavik Chandna","Procheta Sen"],"pdf_url":"https://arxiv.org/pdf/2409.00860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06377v1","updated":"2024-09-10T09:58:55Z","published":"2024-09-10T09:58:55Z","title":"Enhancing Sequential Recommendations through Multi-Perspective\n Reflections and Iteration","summary":" Sequence recommendation (SeqRec) aims to predict the next item a user will\ninteract with by understanding user intentions and leveraging collaborative\nfiltering information. Large language models (LLMs) have shown great promise in\nrecommendation tasks through prompt-based, fixed reflection libraries, and\nfine-tuning techniques. However, these methods face challenges, including lack\nof supervision, inability to optimize reflection sources, inflexibility to\ndiverse user needs, and high computational costs. Despite promising results,\ncurrent studies primarily focus on reflections of users' explicit preferences\n(e.g., item titles) while neglecting implicit preferences (e.g., brands) and\ncollaborative filtering information. This oversight hinders the capture of\npreference shifts and dynamic user behaviors. Additionally, existing approaches\nlack mechanisms for reflection evaluation and iteration, often leading to\nsuboptimal recommendations. To address these issues, we propose the Mixture of\nREflectors (MoRE) framework, designed to model and learn dynamic user\npreferences in SeqRec. Specifically, MoRE introduces three reflectors for\ngenerating LLM-based reflections on explicit preferences, implicit preferences,\nand collaborative signals. Each reflector incorporates a self-improving\nstrategy, termed refining-and-iteration, to evaluate and iteratively update\nreflections. 
Furthermore, a meta-reflector employs a contextual bandit\nalgorithm to select the most suitable expert and corresponding reflections for\neach user's recommendation, effectively capturing dynamic preferences.\nExtensive experiments on three real-world datasets demonstrate that MoRE\nconsistently outperforms state-of-the-art methods, requiring less training time\nand GPU memory compared to other LLM-based approaches in SeqRec.\n","authors":["Weicong Qin","Yi Xu","Weijie Yu","Chenglei Shen","Xiao Zhang","Ming He","Jianping Fan","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2409.06377v1.pdf","comment":"First 3 authors contributes equally to this work"},{"id":"http://arxiv.org/abs/2409.06297v1","updated":"2024-09-10T07:51:53Z","published":"2024-09-10T07:51:53Z","title":"User Preferences for Large Language Model versus Template-Based\n Explanations of Movie Recommendations: A Pilot Study","summary":" Recommender systems have become integral to our digital experiences, from\nonline shopping to streaming platforms. Still, the rationale behind their\nsuggestions often remains opaque to users. While some systems employ a\ngraph-based approach, offering inherent explainability through paths\nassociating recommended items and seed items, non-experts could not easily\nunderstand these explanations. A popular alternative is to convert graph-based\nexplanations into textual ones using a template and an algorithm, which we\ndenote here as ''template-based'' explanations. Yet, these can sometimes come\nacross as impersonal or uninspiring. A novel method would be to employ large\nlanguage models (LLMs) for this purpose, which we denote as ''LLM-based''. To\nassess the effectiveness of LLMs in generating more resonant explanations, we\nconducted a pilot study with 25 participants. They were presented with three\nexplanations: (1) traditional template-based, (2) LLM-based rephrasing of the\ntemplate output, and (3) purely LLM-based explanations derived from the\ngraph-based explanations. Although subject to high variance, preliminary\nfindings suggest that LLM-based explanations may provide a richer and more\nengaging user experience, further aligning with user expectations. This study\nsheds light on the potential limitations of current explanation methods and\noffers promising directions for leveraging large language models to improve\nuser satisfaction and trust in recommender systems.\n","authors":["Julien Albert","Martin Balfroid","Miriam Doh","Jeremie Bogaert","Luca La Fisca","Liesbet De Vos","Bryan Renard","Vincent Stragier","Emmanuel Jean"],"pdf_url":"https://arxiv.org/pdf/2409.06297v1.pdf","comment":"Presented to the Dutch-Belgian Workshop on Recommender Systems 2023\n (14-15 December, 2023 - Antwerp, Belgium)"},{"id":"http://arxiv.org/abs/2409.06226v1","updated":"2024-09-10T05:41:40Z","published":"2024-09-10T05:41:40Z","title":"NLP-Powered Repository and Search Engine for Academic Papers: A Case\n Study on Cyber Risk Literature with CyLit","summary":" As the body of academic literature continues to grow, researchers face\nincreasing difficulties in effectively searching for relevant resources.\nExisting databases and search engines often fall short of providing a\ncomprehensive and contextually relevant collection of academic literature. To\naddress this issue, we propose a novel framework that leverages Natural\nLanguage Processing (NLP) techniques. This framework automates the retrieval,\nsummarization, and clustering of academic literature within a specific research\ndomain. 
To demonstrate the effectiveness of our approach, we introduce CyLit,\nan NLP-powered repository specifically designed for the cyber risk literature.\nCyLit empowers researchers by providing access to context-specific resources\nand enabling the tracking of trends in the dynamic and rapidly evolving field\nof cyber risk. Through the automatic processing of large volumes of data, our\nNLP-powered solution significantly enhances the efficiency and specificity of\nacademic literature searches. We compare the literature categorization results\nof CyLit to those presented in survey papers or generated by ChatGPT,\nhighlighting the distinctive insights this tool provides into cyber risk\nresearch literature. Using NLP techniques, we aim to revolutionize the way\nresearchers discover, analyze, and utilize academic resources, ultimately\nfostering advancements in various domains of knowledge.\n","authors":["Linfeng Zhang","Changyue Hu","Zhiyu Quan"],"pdf_url":"https://arxiv.org/pdf/2409.06226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06177v1","updated":"2024-09-10T03:12:39Z","published":"2024-09-10T03:12:39Z","title":"HierLLM: Hierarchical Large Language Model for Question Recommendation","summary":" Question recommendation is a task that sequentially recommends questions for\nstudents to enhance their learning efficiency. That is, given the learning\nhistory and learning target of a student, a question recommender is supposed to\nselect the question that will bring the most improvement for students. Previous\nmethods typically model the question recommendation as a sequential\ndecision-making problem, estimating students' learning state with the learning\nhistory, and feeding the learning state with the learning target to a neural\nnetwork to select the recommended question from a question set. However,\nprevious methods are faced with two challenges: (1) learning history is\nunavailable in the cold start scenario, which makes the recommender generate\ninappropriate recommendations; (2) the size of the question set is much large,\nwhich makes it difficult for the recommender to select the best question\nprecisely. To address the challenges, we propose a method called hierarchical\nlarge language model for question recommendation (HierLLM), which is a\nLLM-based hierarchical structure. The LLM-based structure enables HierLLM to\ntackle the cold start issue with the strong reasoning abilities of LLM. The\nhierarchical structure takes advantage of the fact that the number of concepts\nis significantly smaller than the number of questions, narrowing the range of\nselectable questions by first identifying the relevant concept for the\nto-recommend question, and then selecting the recommended question based on\nthat concept. This hierarchical structure reduces the difficulty of the\nrecommendation.To investigate the performance of HierLLM, we conduct extensive\nexperiments, and the results demonstrate the outstanding performance of\nHierLLM.\n","authors":["Yuxuan Liu","Haipeng Liu","Ting Long"],"pdf_url":"https://arxiv.org/pdf/2409.06177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06150v1","updated":"2024-09-10T01:52:29Z","published":"2024-09-10T01:52:29Z","title":"What makes a good concept anyway ?","summary":" A good medical ontology is expected to cover its domain completely and\ncorrectly. On the other hand, large ontologies are hard to build, hard to\nunderstand, and hard to maintain. 
Thus, adding new concepts (often multi-word\nconcepts) to an existing ontology must be done judiciously. Only \"good\"\nconcepts should be added; however, it is difficult to define what makes a\nconcept good. In this research, we propose a metric to measure the goodness of\na concept. We identified factors that appear to influence goodness judgments of\nmedical experts and combined them into a single metric. These factors include\nconcept name length (in words), concept occurrence frequency in the medical\nliterature, and syntactic categories of component words. As an added factor, we\nused the simplicity of a term after mapping it into a specific foreign\nlanguage. We performed Bayesian optimization of factor weights to achieve\nmaximum agreement between the metric and three medical experts. The results\nshowed that our metric had a 50.67% overall agreement with the experts, as\nmeasured by Krippendorff's alpha.\n","authors":["Naren Khatwani","James Geller"],"pdf_url":"https://arxiv.org/pdf/2409.06150v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.06690v1","updated":"2024-09-10T17:54:00Z","published":"2024-09-10T17:54:00Z","title":"Benchmarking Sub-Genre Classification For Mainstage Dance Music","summary":" Music classification, with a wide range of applications, is one of the most\nprominent tasks in music information retrieval. To address the absence of\ncomprehensive datasets and high-performing methods in the classification of\nmainstage dance music, this work introduces a novel benchmark comprising a new\ndataset and a baseline. Our dataset extends the number of sub-genres to cover\nmost recent mainstage live sets by top DJs worldwide in music festivals. A\ncontinuous soft labeling approach is employed to account for tracks that span\nmultiple sub-genres, preserving the inherent sophistication. For the baseline,\nwe developed deep learning models that outperform current state-of-the-art\nmultimodel language models, which struggle to identify house music sub-genres,\nemphasizing the need for specialized models trained on fine-grained datasets.\nOur benchmark is applicable to serve for application scenarios such as music\nrecommendation, DJ set curation, and interactive multimedia, where we also\nprovide video demos. Our code is on\n\\url{https://anonymous.4open.science/r/Mainstage-EDM-Benchmark/}.\n","authors":["Hongzhi Shu","Xinglin Li","Hongyu Jiang","Minghao Fu","Xinyu Li"],"pdf_url":"https://arxiv.org/pdf/2409.06690v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2404.09654v2","updated":"2024-09-10T11:58:23Z","published":"2024-04-15T10:42:22Z","title":"Do LLMs Understand Visual Anomalies? Uncovering LLM's Capabilities in\n Zero-shot Anomaly Detection","summary":" Large vision-language models (LVLMs) are markedly proficient in deriving\nvisual representations guided by natural language. Recent explorations have\nutilized LVLMs to tackle zero-shot visual anomaly detection (VAD) challenges by\npairing images with textual descriptions indicative of normal and abnormal\nconditions, referred to as anomaly prompts. However, existing approaches depend\non static anomaly prompts that are prone to cross-semantic ambiguity, and\nprioritize global image-level representations over crucial local pixel-level\nimage-to-text alignment that is necessary for accurate anomaly localization. In\nthis paper, we present ALFA, a training-free approach designed to address these\nchallenges via a unified model. 
We propose a run-time prompt adaptation\nstrategy, which first generates informative anomaly prompts to leverage the\ncapabilities of a large language model (LLM). This strategy is enhanced by a\ncontextual scoring mechanism for per-image anomaly prompt adaptation and\ncross-semantic ambiguity mitigation. We further introduce a novel fine-grained\naligner to fuse local pixel-level semantics for precise anomaly localization,\nby projecting the image-text alignment from global to local semantic spaces.\nExtensive evaluations on MVTec and VisA datasets confirm ALFA's effectiveness\nin harnessing the language potential for zero-shot VAD, achieving significant\nPRO improvements of 12.1% on MVTec and 8.9% on VisA compared to\nstate-of-the-art approaches.\n","authors":["Jiaqi Zhu","Shaofeng Cai","Fang Deng","Beng Chin Ooi","Junran Wu"],"pdf_url":"https://arxiv.org/pdf/2404.09654v2.pdf","comment":"Accepted by MM'24 (Oral)"},{"id":"http://arxiv.org/abs/2409.06371v1","updated":"2024-09-10T09:53:06Z","published":"2024-09-10T09:53:06Z","title":"Distilling Generative-Discriminative Representations for Very\n Low-Resolution Face Recognition","summary":" Very low-resolution face recognition is challenging due to the serious loss\nof informative facial details in resolution degradation. In this paper, we\npropose a generative-discriminative representation distillation approach that\ncombines generative representation with cross-resolution aligned knowledge\ndistillation. This approach facilitates very low-resolution face recognition by\njointly distilling generative and discriminative models via two distillation\nmodules. Firstly, the generative representation distillation takes the encoder\nof a diffusion model pretrained for face super-resolution as the generative\nteacher to supervise the learning of the student backbone via feature\nregression, and then freezes the student backbone. After that, the\ndiscriminative representation distillation further considers a pretrained face\nrecognizer as the discriminative teacher to supervise the learning of the\nstudent head via cross-resolution relational contrastive distillation. In this\nway, the general backbone representation can be transformed into discriminative\nhead representation, leading to a robust and discriminative student model for\nvery low-resolution face recognition. Our approach improves the recovery of the\nmissing details in very low-resolution faces and achieves better knowledge\ntransfer. Extensive experiments on face datasets demonstrate that our approach\nenhances the recognition accuracy of very low-resolution faces, showcasing its\neffectiveness and adaptability.\n","authors":["Junzheng Zhang","Weijia Guo","Bochao Liu","Ruixin Shi","Yong Li","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2409.06371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04388v3","updated":"2024-09-10T09:46:58Z","published":"2024-09-06T16:27:52Z","title":"Question-Answering Dense Video Events","summary":" Multimodal Large Language Models (MLLMs) have shown excellent performance in\nquestion-answering of single-event videos. In this paper, we present\nquestion-answering dense video events, a novel task that requires answering and\ngrounding the dense-event questions in long videos, thus challenging MLLMs to\nfaithfully comprehend and reason about multiple events occurring over extended\ntime periods. To facilitate the study, we construct DeVE-QA - a dataset\nfeaturing 78K questions about 26K events on 10.6K long videos. 
We then\nbenchmark and show that existing MLLMs excelling at single-event QA struggle to\nperform well in DeVE-QA. For improvement, we propose DeVi, a novel\ntraining-free MLLM approach that highlights a hierarchical captioning module, a\ntemporal event memory module, and a self-consistency checking module to\nrespectively detect, contextualize and memorize, and ground dense-events in\nlong videos for question answering. Extensive experiments show that DeVi is\nsuperior at answering dense-event questions and grounding relevant video\nmoments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1\npercent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA\nrespectively.\n","authors":["Hangyu Qin","Junbin Xiao","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2409.04388v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02453v2","updated":"2024-09-10T08:20:36Z","published":"2024-09-04T05:19:57Z","title":"FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video\n Reconstruction in Resource and Timing Constrained Network Settings","summary":" Despite the growing adoption of video processing via Internet of Things (IoT)\ndevices due to their cost-effectiveness, transmitting captured data to nearby\nservers poses challenges due to varying timing constraints and scarcity of\nnetwork bandwidth. Existing video compression methods face difficulties in\nrecovering compressed data when incomplete data is provided. Here, we introduce\nFrameCorr, a deep-learning based solution that utilizes previously received\ndata to predict the missing segments of a frame, enabling the reconstruction of\na frame from partially received data.\n","authors":["John Li","Shehab Sarar Ahmed","Deepak Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08564v2","updated":"2024-09-10T07:30:02Z","published":"2024-06-12T18:07:06Z","title":"Machine Learning-Driven Open-Source Framework for Assessing QoE in\n Multimedia Networks","summary":" The Internet is integral to modern life, influencing communication, business,\nand lifestyles globally. As dependence on Internet services grows, the demand\nfor high-quality service delivery increases. Service providers must maintain\nhigh standards of quality of service and quality of experience (QoE) to ensure\nuser satisfaction. QoE, which reflects user satisfaction with service quality,\nis a key metric for multimedia services, yet it is challenging to measure due\nto its subjective nature and the complexities of real-time feedback. This paper\nintroduces a machine learning-based framework for objectively assessing QoE in\nmultimedia networks. The open-source framework complies with the ITU-T P.1203\nstandard. It automates data collection and user satisfaction prediction using\nkey network parameters such as delay, jitter, packet loss, bitrate, and\nthroughput. Using a dataset of over 20,000 records from various network\nconditions, the Random Forest model predicts the mean opinion score with 95.8%\naccuracy. Our framework addresses the limitations of existing QoE models by\nintegrating real-time data collection, machine learning predictions, and\nadherence to international standards. This approach enhances QoE evaluation\naccuracy and allows dynamic network resource management, optimizing performance\nand cost-efficiency. Its open-source nature encourages adaptation and extension\nfor various multimedia services. 
The findings significantly affect the\ntelecommunications industry in managing and optimizing multimedia services. The\nnetwork centric QoE prediction of the framework offers a scalable solution to\nimprove user satisfaction without the need for content-specific data. Future\nenhancements could include advanced machine learning models and broader\napplicability to digital services. This research contributes a practical,\nstandardized tool for QoE assessment across diverse networks and platforms.\n","authors":["Parsa Hassani Shariat Panahi","Amir Hossein Jalilvand","Abolfazl Diyanat"],"pdf_url":"https://arxiv.org/pdf/2406.08564v2.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.06224v1","updated":"2024-09-10T05:28:38Z","published":"2024-09-10T05:28:38Z","title":"MIP-GAF: A MLLM-annotated Benchmark for Most Important Person\n Localization and Group Context Understanding","summary":" Estimating the Most Important Person (MIP) in any social event setup is a\nchallenging problem mainly due to contextual complexity and scarcity of labeled\ndata. Moreover, the causality aspects of MIP estimation are quite subjective\nand diverse. To this end, we aim to address the problem by annotating a\nlarge-scale `in-the-wild' dataset for identifying human perceptions about the\n`Most Important Person (MIP)' in an image. The paper provides a thorough\ndescription of our proposed Multimodal Large Language Model (MLLM) based data\nannotation strategy, and a thorough data quality analysis. Further, we perform\na comprehensive benchmarking of the proposed dataset utilizing state-of-the-art\nMIP localization methods, indicating a significant drop in performance compared\nto existing datasets. The performance drop shows that the existing MIP\nlocalization algorithms must be more robust with respect to `in-the-wild'\nsituations. We believe the proposed dataset will play a vital role in building\nthe next-generation social situation understanding methods. The code and data\nis available at https://github.com/surbhimadan92/MIP-GAF.\n","authors":["Surbhi Madan","Shreya Ghosh","Lownish Rai Sookha","M. A. Ganaie","Ramanathan Subramanian","Abhinav Dhall","Tom Gedeon"],"pdf_url":"https://arxiv.org/pdf/2409.06224v1.pdf","comment":"Accepted for publication at WACV 2025"},{"id":"http://arxiv.org/abs/2409.06207v1","updated":"2024-09-10T04:24:22Z","published":"2024-09-10T04:24:22Z","title":"Design and Implementation of Online Live Streaming System Using A 3D\n Engine","summary":" With the growing demand for live video streaming, there is an increasing need\nfor low-latency and high-quality transmission, especially with the advent of 5G\nnetworks. While 5G offers hardware-level improvements, effective software\nsolutions for minimizing latency remain essential. Current methods, such as\nmulti-channel streaming, fail to address latency issues fundamentally, often\nonly adding new channels without optimizing overall performance. This thesis\nproposes a novel approach using a 3D engine (e.g., Unity 3D) to stream\nmulti-input video data through a single channel with reduced latency. By\nleveraging 3D engine capabilities, such as World/Screen Space Cameras, 3D\nCanvases, and Webcam Textures, the proposed system consolidates video streams\nfrom multiple external cameras into a unified, low-latency output. The\naffiliated project of this thesis demonstrates the implementation of a\nlow-latency multi-channel live video streaming system. 
It employs the RTSP\nprotocol and examines video encoding techniques, alongside a client-side\napplication based on Unity 3D. The system architecture includes a WebSocket\nserver for persistent connections, an HTTP server for communication, a MySQL\ndatabase for storage, Redis for caching, and Nginx for load balancing. Each\nmodule operates independently, ensuring flexibility and scalability in the\nsystem's design. A key innovation of this system is its use of a 3D scene to\nmap multiple video inputs onto a virtual canvas, recorded by an in-engine\ncamera for transmission. This design minimizes redundant data, enabling an\nefficient and director-guided live streaming network. The thesis concludes by\ndiscussing challenges encountered during the project and provides solutions for\nfuture improvement.\n","authors":["Aizierjiang Aiersilan"],"pdf_url":"https://arxiv.org/pdf/2409.06207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06135v1","updated":"2024-09-10T01:07:20Z","published":"2024-09-10T01:07:20Z","title":"Draw an Audio: Leveraging Multi-Instruction for Video-to-Audio Synthesis","summary":" Foley is a term commonly used in filmmaking, referring to the addition of\ndaily sound effects to silent films or videos to enhance the auditory\nexperience. Video-to-Audio (V2A), as a particular type of automatic foley task,\npresents inherent challenges related to audio-visual synchronization. These\nchallenges encompass maintaining the content consistency between the input\nvideo and the generated audio, as well as the alignment of temporal and\nloudness properties within the video. To address these issues, we construct a\ncontrollable video-to-audio synthesis model, termed Draw an Audio, which\nsupports multiple input instructions through drawn masks and loudness signals.\nTo ensure content consistency between the synthesized audio and target video,\nwe introduce the Mask-Attention Module (MAM), which employs masked video\ninstruction to enable the model to focus on regions of interest. Additionally,\nwe implement the Time-Loudness Module (TLM), which uses an auxiliary loudness\nsignal to ensure the synthesis of sound that aligns with the video in both\nloudness and temporal dimensions. Furthermore, we have extended a large-scale\nV2A dataset, named VGGSound-Caption, by annotating caption prompts. Extensive\nexperiments on challenging benchmarks across two large-scale V2A datasets\nverify Draw an Audio achieves the state-of-the-art. Project page:\nhttps://yannqi.github.io/Draw-an-Audio/.\n","authors":["Qi Yang","Binjie Mao","Zili Wang","Xing Nie","Pengfei Gao","Ying Guo","Cheng Zhen","Pengfei Yan","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2409.06135v1.pdf","comment":"14 pages, 11 figures"}]},"2024-09-09T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.05667v2","updated":"2024-09-09T23:46:59Z","published":"2024-08-11T01:14:13Z","title":"PhishLang: A Lightweight, Client-Side Phishing Detection Framework using\n MobileBERT for Real-Time, Explainable Threat Mitigation","summary":" In this paper, we introduce PhishLang, an open-source, lightweight language\nmodel specifically designed for phishing website detection through contextual\nanalysis of the website. 
Unlike traditional heuristic or machine learning\nmodels that rely on static features and struggle to adapt to new threats, and\ndeep learning models that are computationally intensive, our model leverages\nMobileBERT, a fast and memory-efficient variant of the BERT architecture, to\nlearn granular features characteristic of phishing attacks. PhishLang operates\nwith minimal data preprocessing and offers performance comparable to leading\ndeep learning anti-phishing tools, while being significantly faster and less\nresource-intensive. Over a 3.5-month testing period, PhishLang successfully\nidentified 25,796 phishing URLs, many of which were undetected by popular\nantiphishing blocklists, thus demonstrating its potential to enhance current\ndetection measures. Capitalizing on PhishLang's resource efficiency, we release\nthe first open-source fully client-side Chromium browser extension that\nprovides inference locally without requiring to consult an online blocklist and\ncan be run on low-end systems with no impact on inference times. Our\nimplementation not only outperforms prevalent (server-side) phishing tools, but\nis significantly more effective than the limited commercial client-side\nmeasures available. Furthermore, we study how PhishLang can be integrated with\nGPT-3.5 Turbo to create explainable blocklisting -- which, upon detection of a\nwebsite, provides users with detailed contextual information about the features\nthat led to a website being marked as phishing.\n","authors":["Sayak Saha Roy","Shirin Nilizadeh"],"pdf_url":"https://arxiv.org/pdf/2408.05667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05806v1","updated":"2024-09-09T17:11:51Z","published":"2024-09-09T17:11:51Z","title":"Benchmarking Chinese Knowledge Rectification in Large Language Models","summary":" While Large Language Models (LLMs) exhibit remarkable generative\ncapabilities, they are not without flaws, particularly in the form of\nhallucinations. This issue is even more pronounced when LLMs are applied to\nspecific languages and domains. For example, LLMs may generate nonsense\ninformation when handling Chinese ancient poetry, proverbs, or idioms, owing to\nthe lack of specific knowledge. To this end, this paper introduces a benchmark\nfor rectifying Chinese knowledge in LLMs via knowledge editing. Specifically,\nwe introduce a new Chinese dataset, CKnowEdit, by collecting seven type of\nknowledge from various sources, including classical texts, idioms, and content\nfrom Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony,\nantithesis, and logical constructs inherent in the Chinese language. Through\nthe analysis of this dataset, we uncover the challenges faced by current LLMs\nin mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge\nediting techniques on this dataset unveil the substantial scope for advancement\nin the rectification of Chinese knowledge. 
Code and dataset are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Tianhe Lu","Jizhan Fang","Yunzhi Yao","Xin Xu","Ningyu Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05806v1.pdf","comment":"Ongoing work; code and dataset are available at\n https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2409.07497v1","updated":"2024-09-09T16:46:47Z","published":"2024-09-09T16:46:47Z","title":"OneEdit: A Neural-Symbolic Collaboratively Knowledge Editing System","summary":" Knowledge representation has been a central aim of AI since its inception.\nSymbolic Knowledge Graphs (KGs) and neural Large Language Models (LLMs) can\nboth represent knowledge. KGs provide highly accurate and explicit knowledge\nrepresentation, but face scalability issue; while LLMs offer expansive coverage\nof knowledge, but incur significant training costs and struggle with precise\nand reliable knowledge manipulation. To this end, we introduce OneEdit, a\nneural-symbolic prototype system for collaborative knowledge editing using\nnatural language, which facilitates easy-to-use knowledge management with KG\nand LLM. OneEdit consists of three modules: 1) The Interpreter serves for user\ninteraction with natural language; 2) The Controller manages editing requests\nfrom various users, leveraging the KG with rollbacks to handle knowledge\nconflicts and prevent toxic knowledge attacks; 3) The Editor utilizes the\nknowledge from the Controller to edit KG and LLM. We conduct experiments on two\nnew datasets with KGs which demonstrate that OneEdit can achieve superior\nperformance.\n","authors":["Ningyu Zhang","Zekun Xi","Yujie Luo","Peng Wang","Bozhong Tian","Yunzhi Yao","Jintian Zhang","Shumin Deng","Mengshu Sun","Lei Liang","Zhiqiang Zhang","Xiaowei Zhu","Jun Zhou","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.07497v1.pdf","comment":"LLM+KG@VLDB2024, code is available at\n https://github.com/zjunlp/OneEdit"},{"id":"http://arxiv.org/abs/2409.05692v1","updated":"2024-09-09T15:05:27Z","published":"2024-09-09T15:05:27Z","title":"Extracting the U.S. building types from OpenStreetMap data","summary":" Building type information is crucial for population estimation, traffic\nplanning, urban planning, and emergency response applications. Although\nessential, such data is often not readily available. To alleviate this problem,\nthis work creates a comprehensive dataset by providing\nresidential/non-residential building classification covering the entire United\nStates. We propose and utilize an unsupervised machine learning method to\nclassify building types based on building footprints and available\nOpenStreetMap information. The classification result is validated using\nauthoritative ground truth data for select counties in the U.S. The validation\nshows a high precision for non-residential building classification and a high\nrecall for residential buildings. We identified various approaches to improving\nthe quality of the classification, such as removing sheds and garages from the\ndataset. Furthermore, analyzing the misclassifications revealed that they are\nmainly due to missing and scarce metadata in OSM. A major result of this work\nis the resulting dataset of classifying 67,705,475 buildings. We hope that this\ndata is of value to the scientific community, including urban and\ntransportation planners.\n","authors":["Henrique F. de Arruda","Sandro M. Reia","Shiyang Ruan","Kuldip S. 
Atwal","Hamdi Kavak","Taylor Anderson","Dieter Pfoser"],"pdf_url":"https://arxiv.org/pdf/2409.05692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05677v1","updated":"2024-09-09T14:44:19Z","published":"2024-09-09T14:44:19Z","title":"RegNLP in Action: Facilitating Compliance Through Automated Information\n Retrieval and Answer Generation","summary":" Regulatory documents, issued by governmental regulatory bodies, establish\nrules, guidelines, and standards that organizations must adhere to for legal\ncompliance. These documents, characterized by their length, complexity and\nfrequent updates, are challenging to interpret, requiring significant\nallocation of time and expertise on the part of organizations to ensure ongoing\ncompliance.Regulatory Natural Language Processing (RegNLP) is a\nmultidisciplinary subfield aimed at simplifying access to and interpretation of\nregulatory rules and obligations. We define an Automated Question-Passage\nGeneration task for RegNLP, create the ObliQA dataset containing 27,869\nquestions derived from the Abu Dhabi Global Markets (ADGM) financial regulation\ndocument collection, design a baseline Regulatory Information Retrieval and\nAnswer Generation system, and evaluate it with RePASs, a novel evaluation\nmetric that tests whether generated answers accurately capture all relevant\nobligations and avoid contradictions.\n","authors":["Tuba Gokhan","Kexin Wang","Iryna Gurevych","Ted Briscoe"],"pdf_url":"https://arxiv.org/pdf/2409.05677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05633v1","updated":"2024-09-09T14:04:17Z","published":"2024-09-09T14:04:17Z","title":"Enhancing Graph Contrastive Learning with Reliable and Informative\n Augmentation for Recommendation","summary":" Graph neural network (GNN) has been a powerful approach in collaborative\nfiltering (CF) due to its ability to model high-order user-item relationships.\nRecently, to alleviate the data sparsity and enhance representation learning,\nmany efforts have been conducted to integrate contrastive learning (CL) with\nGNNs. Despite the promising improvements, the contrastive view generation based\non structure and representation perturbations in existing methods potentially\ndisrupts the collaborative information in contrastive views, resulting in\nlimited effectiveness of positive alignment. To overcome this issue, we propose\nCoGCL, a novel framework that aims to enhance graph contrastive learning by\nconstructing contrastive views with stronger collaborative information via\ndiscrete codes. The core idea is to map users and items into discrete codes\nrich in collaborative information for reliable and informative contrastive view\ngeneration. To this end, we initially introduce a multi-level vector quantizer\nin an end-to-end manner to quantize user and item representations into discrete\ncodes. Based on these discrete codes, we enhance the collaborative information\nof contrastive views by considering neighborhood structure and semantic\nrelevance respectively. For neighborhood structure, we propose virtual neighbor\naugmentation by treating discrete codes as virtual neighbors, which expands an\nobserved user-item interaction into multiple edges involving discrete codes.\nRegarding semantic relevance, we identify similar users/items based on shared\ndiscrete codes and interaction targets to generate the semantically relevant\nview. 
Through these strategies, we construct contrastive views with stronger\ncollaborative information and develop a triple-view graph contrastive learning\napproach. Extensive experiments on four public datasets demonstrate the\neffectiveness of our proposed approach.\n","authors":["Bowen Zheng","Junjie Zhang","Hongyu Lu","Yu Chen","Ming Chen","Wayne Xin Zhao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2409.05633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05546v1","updated":"2024-09-09T12:11:53Z","published":"2024-09-09T12:11:53Z","title":"End-to-End Learnable Item Tokenization for Generative Recommendation","summary":" Recently, generative recommendation has emerged as a promising new paradigm\nthat directly generates item identifiers for recommendation. However, a key\nchallenge lies in how to effectively construct item identifiers that are\nsuitable for recommender systems. Existing methods typically decouple item\ntokenization from subsequent generative recommendation training, likely\nresulting in suboptimal performance. To address this limitation, we propose\nETEGRec, a novel End-To-End Generative Recommender by seamlessly integrating\nitem tokenization and generative recommendation. Our framework is developed\nbased on the dual encoder-decoder architecture, which consists of an item\ntokenizer and a generative recommender. In order to achieve mutual enhancement\nbetween the two components, we propose a recommendation-oriented alignment\napproach by devising two specific optimization objectives: sequence-item\nalignment and preference-semantic alignment. These two alignment objectives can\neffectively couple the learning of item tokenizer and generative recommender,\nthereby fostering the mutual enhancement between the two components. Finally,\nwe further devise an alternating optimization method, to facilitate stable and\neffective end-to-end learning of the entire framework. Extensive experiments\ndemonstrate the effectiveness of our proposed framework compared to a series of\ntraditional sequential recommendation models and generative recommendation\nbaselines.\n","authors":["Enze Liu","Bowen Zheng","Cheng Ling","Lantao Hu","Han Li","Wayne Xin Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.05546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05512v1","updated":"2024-09-09T11:10:45Z","published":"2024-09-09T11:10:45Z","title":"DatAasee -- A Metadata-Lake as Metadata Catalog for a Virtual Data-Lake","summary":" Metadata management for distributed data sources is a long-standing but\never-growing problem. To counter this challenge in a research-data and\nlibrary-oriented setting, this work constructs a data architecture, derived\nfrom the data-lake: the metadata-lake. A proof-of-concept implementation of\nthis proposed metadata system is presented and evaluated as well.\n","authors":["Christian Himpe"],"pdf_url":"https://arxiv.org/pdf/2409.05512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03753v2","updated":"2024-09-09T10:04:00Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. 
To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis' utility through three case\nstudies: facilitating chatbot misuse research, visualizing and comparing topic\ndistributions across datasets, and characterizing user-specific conversation\npatterns. WildVis is open-source and designed to be extendable, supporting\nadditional datasets and customized search and visualization functionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05461v1","updated":"2024-09-09T09:42:31Z","published":"2024-09-09T09:42:31Z","title":"Recommender Systems Algorithm Selection for Ranking Prediction on\n Implicit Feedback Datasets","summary":" The recommender systems algorithm selection problem for ranking prediction on\nimplicit feedback datasets is under-explored. Traditional approaches in\nrecommender systems algorithm selection focus predominantly on rating\nprediction on explicit feedback datasets, leaving a research gap for ranking\nprediction on implicit feedback datasets. Algorithm selection is a critical\nchallenge for nearly every practitioner in recommender systems. In this work,\nwe take the first steps toward addressing this research gap. We evaluate the\nNDCG@10 of 24 recommender systems algorithms, each with two hyperparameter\nconfigurations, on 72 recommender systems datasets. We train four optimized\nmachine-learning meta-models and one automated machine-learning meta-model with\nthree different settings on the resulting meta-dataset. Our results show that\nthe predictions of all tested meta-models exhibit a median Spearman correlation\nranging from 0.857 to 0.918 with the ground truth. We show that the median\nSpearman correlation between meta-model predictions and the ground truth\nincreases by an average of 0.124 when the meta-model is optimized to predict\nthe ranking of algorithms instead of their performance. Furthermore, in terms\nof predicting the best algorithm for an unknown dataset, we demonstrate that\nthe best optimized traditional meta-model, e.g., XGBoost, achieves a recall of\n48.6%, outperforming the best tested automated machine learning meta-model,\ne.g., AutoGluon, which achieves a recall of 47.2%.\n","authors":["Lukas Wegmeth","Tobias Vente","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2409.05461v1.pdf","comment":"Accepted for presentation at the 18th ACM Conference on Recommender\n Systems in the Late-Breaking Results Track"},{"id":"http://arxiv.org/abs/2409.05925v1","updated":"2024-09-09T08:29:39Z","published":"2024-09-09T08:29:39Z","title":"Assessing SPARQL capabilities of Large Language Models","summary":" The integration of Large Language Models (LLMs) with Knowledge Graphs (KGs)\noffers significant synergistic potential for knowledge-driven applications. One\npossible integration is the interpretation and generation of formal languages,\nsuch as those used in the Semantic Web, with SPARQL being a core technology for\naccessing KGs. 
In this paper, we focus on measuring out-of-the box capabilities\nof LLMs to work with SPARQL and more specifically with SPARQL SELECT queries\napplying a quantitative approach.\n We implemented various benchmarking tasks in the LLM-KG-Bench framework for\nautomated execution and evaluation with several LLMs. The tasks assess\ncapabilities along the dimensions of syntax, semantic read, semantic create,\nand the role of knowledge graph prompt inclusion.\n With this new benchmarking tasks, we evaluated a selection of GPT, Gemini,\nand Claude models. Our findings indicate that working with SPARQL SELECT\nqueries is still challenging for LLMs and heavily depends on the specific LLM\nas well as the complexity of the task. While fixing basic syntax errors seems\nto pose no problems for the best of the current LLMs evaluated, creating\nsemantically correct SPARQL SELECT queries is difficult in several cases.\n","authors":["Lars-Peter Meyer","Johannes Frey","Felix Brei","Natanael Arndt"],"pdf_url":"https://arxiv.org/pdf/2409.05925v1.pdf","comment":"peer reviewed publication at NLP4KGc @ Semantics 2024, see\n https://sites.google.com/view/3rdnlp4kgc"},{"id":"http://arxiv.org/abs/2409.05417v1","updated":"2024-09-09T08:19:43Z","published":"2024-09-09T08:19:43Z","title":"Replicability Measures for Longitudinal Information Retrieval Evaluation","summary":" Information Retrieval (IR) systems are exposed to constant changes in most\ncomponents. Documents are created, updated, or deleted, the information needs\nare changing, and even relevance might not be static. While it is generally\nexpected that the IR systems retain a consistent utility for the users, test\ncollection evaluations rely on a fixed experimental setup. Based on the\nLongEval shared task and test collection, this work explores how the\neffectiveness measured in evolving experiments can be assessed. Specifically,\nthe persistency of effectiveness is investigated as a replicability task. It is\nobserved how the effectiveness progressively deteriorates over time compared to\nthe initial measurement. Employing adapted replicability measures provides\nfurther insight into the persistence of effectiveness. The ranking of systems\nvaries across retrieval measures and time. In conclusion, it was found that the\nmost effective systems are not necessarily the ones with the most persistent\nperformance.\n","authors":["Jüri Keller","Timo Breuer","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2409.05417v1.pdf","comment":"Experimental IR Meets Multilinguality, Multimodality, and Interaction\n - 15th International Conference of the CLEF Association, CLEF 2024, Grenoble,\n France, September 9-12, 2024, Proceedings. arXiv admin note: text overlap\n with arXiv:2308.10549"},{"id":"http://arxiv.org/abs/2409.05401v1","updated":"2024-09-09T07:57:43Z","published":"2024-09-09T07:57:43Z","title":"NLLB-E5: A Scalable Multilingual Retrieval Model","summary":" Despite significant progress in multilingual information retrieval, the lack\nof models capable of effectively supporting multiple languages, particularly\nlow-resource like Indic languages, remains a critical challenge. This paper\npresents NLLB-E5: A Scalable Multilingual Retrieval Model. NLLB-E5 leverages\nthe in-built multilingual capabilities in the NLLB encoder for translation\ntasks. 
It proposes a distillation approach from multilingual retriever E5 to\nprovide a zero-shot retrieval approach handling multiple languages, including\nall major Indic languages, without requiring multilingual training data. We\nevaluate the model on a comprehensive suite of existing benchmarks, including\nHindi-BEIR, highlighting its robust performance across diverse languages and\ntasks. Our findings uncover task and domain-specific challenges, providing\nvaluable insights into the retrieval performance, especially for low-resource\nlanguages. NLLB-E5 addresses the urgent need for an inclusive, scalable, and\nlanguage-agnostic text retrieval model, advancing the field of multilingual\ninformation access and promoting digital inclusivity for millions of users\nglobally.\n","authors":["Arkadeep Acharya","Rudra Murthy","Vishwajeet Kumar","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2409.05401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03893v2","updated":"2024-09-09T07:47:58Z","published":"2024-09-05T19:59:42Z","title":"Understanding Fairness in Recommender Systems: A Healthcare Perspective","summary":" Fairness in AI-driven decision-making systems has become a critical concern,\nespecially when these systems directly affect human lives. This paper explores\nthe public's comprehension of fairness in healthcare recommendations. We\nconducted a survey where participants selected from four fairness metrics --\nDemographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive\nValue -- across different healthcare scenarios to assess their understanding of\nthese concepts. Our findings reveal that fairness is a complex and often\nmisunderstood concept, with a generally low level of public understanding\nregarding fairness metrics in recommender systems. This study highlights the\nneed for enhanced information and education on algorithmic fairness to support\ninformed decision-making in using these systems. Furthermore, the results\nsuggest that a one-size-fits-all approach to fairness may be insufficient,\npointing to the importance of context-sensitive designs in developing equitable\nAI systems.\n","authors":["Veronica Kecki","Alan Said"],"pdf_url":"https://arxiv.org/pdf/2409.03893v2.pdf","comment":"Accepted to the 18th ACM Conference on Recommender Systems"},{"id":"http://arxiv.org/abs/2407.14482v2","updated":"2024-09-09T06:19:07Z","published":"2024-07-19T17:35:47Z","title":"ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG\n Capabilities","summary":" In this work, we introduce ChatQA 2, an Llama 3.0-based model with a 128K\ncontext window, designed to bridge the gap between open-source LLMs and leading\nproprietary models (e.g., GPT-4-Turbo) in long-context understanding and\nretrieval-augmented generation (RAG) capabilities. These two capabilities are\nessential for LLMs to process large volumes of information that cannot fit into\na single prompt and are complementary to each other, depending on the\ndownstream tasks and computational budgets. We present a detailed continued\ntraining recipe to extend the context window of Llama3-70B-base from 8K to 128K\ntokens, along with a three-stage instruction tuning process to enhance the\nmodel's instruction-following, RAG performance, and long-context understanding\ncapabilities. 
Our results demonstrate that the Llama3-ChatQA-2-70B model\noutperforms most existing state-of-the-art models, including\nGPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on\nultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only\na 4K context window, showing the strong long context capability across varying\nsequence lengths. We further provide extensive comparisons between direct\nlong-context and RAG solutions using the same state-of-the-art long-context\nLLMs. Interestingly, we find that the performance of strong long-context LLMs\nusing RAG improves when retrieving a larger number of chunks. With a large set\nof top-k chunks, RAG consistently outperforms direct long-context solution\nusing the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B\nand Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To\nadvance research in this field, we open-sourced the model weights, training\ndata, and the evaluation setup for the for the community:\nhttps://chatqa2-project.github.io/\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Chejian Xu","Zihan Liu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.14482v2.pdf","comment":"v2: major update with significantly improved results"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.06051v1","updated":"2024-09-09T20:22:00Z","published":"2024-09-09T20:22:00Z","title":"REVISION: A Roadmap on Adaptive Video Streaming Optimization","summary":" Due to the soaring popularity of video applications and the consequent rise\nin video traffic on the Internet, technologies like HTTP Adaptive Streaming\n(HAS) are crucial for delivering high Quality of Experience (QoE) to consumers.\nHAS technology enables video players on consumer devices to enhance viewer\nengagement by dynamically adapting video content quality based on network\nconditions. This is especially relevant for consumer electronics as it ensures\nan optimized viewing experience across a variety of devices, from smartphones\nto smart TVs. This paper introduces REVISION, an efficient roadmap designed to\nenhance adaptive video streaming, a core feature of modern consumer\nelectronics. The REVISION optimization triangle highlights three essential\naspects for improving streaming: Objective, Input Space, and Action Domain.\nAdditionally, REVISION proposes a novel layer-based architecture tailored to\nrefine video streaming systems, comprising Application, Control and Management,\nand Resource layers. Each layer is designed to optimize different components of\nthe streaming process, which is directly linked to the performance and\nefficiency of consumer devices. By adopting the principles of the REVISION,\nmanufacturers and developers can significantly improve the streaming\ncapabilities of consumer electronics, thereby enriching the consumer's\nmultimedia experience and accommodating the increasing demand for high-quality,\nreal-time video content. 
This approach addresses the complexities of today's\ndiverse video streaming ecosystem and paves the way for future advancements in\nconsumer technology.\n","authors":["Farzad Tashtarian","Christian Timmerer"],"pdf_url":"https://arxiv.org/pdf/2409.06051v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.05772v1","updated":"2024-09-09T16:33:40Z","published":"2024-09-09T16:33:40Z","title":"A CLIP-based siamese approach for meme classification","summary":" Memes are an increasingly prevalent element of online discourse in social\nnetworks, especially among young audiences. They carry ideas and messages that\nrange from humorous to hateful, and are widely consumed. Their potentially high\nimpact requires adequate means of control to moderate their use in large scale.\nIn this work, we propose SimCLIP a deep learning-based architecture for\ncross-modal understanding of memes, leveraging a pre-trained CLIP encoder to\nproduce context-aware embeddings and a Siamese fusion technique to capture the\ninteractions between text and image. We perform an extensive experimentation on\nseven meme classification tasks across six datasets. We establish a new state\nof the art in Memotion7k with a 7.25% relative F1-score improvement, and\nachieve super-human performance on Harm-P with 13.73% F1-Score improvement. Our\napproach demonstrates the potential for compact meme classification models,\nenabling accurate and efficient meme monitoring. We share our code at\nhttps://github.com/jahuerta92/meme-classification-simclip\n","authors":["Javier Huertas-Tato","Christos Koutlis","Symeon Papadopoulos","David Camacho","Ioannis Kompatsiaris"],"pdf_url":"https://arxiv.org/pdf/2409.05772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05750v1","updated":"2024-09-09T16:05:40Z","published":"2024-09-09T16:05:40Z","title":"A Toolkit for Joint Speaker Diarization and Identification with\n Application to Speaker-Attributed ASR","summary":" We present a modular toolkit to perform joint speaker diarization and speaker\nidentification. The toolkit can leverage on multiple models and algorithms\nwhich are defined in a configuration file. Such flexibility allows our system\nto work properly in various conditions (e.g., multiple registered speakers'\nsets, acoustic conditions and languages) and across application domains (e.g.\nmedia monitoring, institutional, speech analytics). In this demonstration we\nshow a practical use-case in which speaker-related information is used jointly\nwith automatic speech recognition engines to generate speaker-attributed\ntranscriptions. To achieve that, we employ a user-friendly web-based interface\nto process audio and video inputs with the chosen configuration.\n","authors":["Giovanni Morrone","Enrico Zovato","Fabio Brugnara","Enrico Sartori","Leonardo Badino"],"pdf_url":"https://arxiv.org/pdf/2409.05750v1.pdf","comment":"Show and Tell paper. Presented at Interspeech 2024"},{"id":"http://arxiv.org/abs/2409.04398v2","updated":"2024-09-09T15:08:06Z","published":"2024-09-06T16:43:04Z","title":"HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale\n Space Using Wearable IMUs and LiDAR","summary":" We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture\nmethod, aimed at accurately and efficiently creating a dynamic digital world,\ncontaining large-scale indoor-outdoor scenes, diverse human motions, rich\nhuman-human interactions, and human-environment interactions. 
By utilizing\nbody-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human\nmotions in unconstrained space without the need for external devices and\npre-built maps. This affords great flexibility and accessibility for\nhuman-centered interaction and 4D scene capturing in various environments.\nTaking into account that IMUs can capture human spatially unrestricted poses\nbut are prone to drifting for long-period using, and while LiDAR is stable for\nglobal localization but rough for local positions and orientations, HiSC4D\nemploys a joint optimization method, harmonizing all sensors and utilizing\nenvironment cues, yielding promising results for long-term capture in large\nscenes. To promote research of egocentric human interaction in large scenes and\nfacilitate downstream tasks, we also present a dataset, containing 8 sequences\nin 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D\nhuman motions with SMPL annotations and dynamic scenes, 31k frames of cropped\nhuman point clouds, and scene mesh of the environment. A variety of scenarios,\nsuch as the basketball gym and commercial street, alongside challenging human\nmotions, such as daily greeting, one-on-one basketball playing, and tour\nguiding, demonstrate the effectiveness and the generalization ability of\nHiSC4D. The dataset and code will be publicated on\nwww.lidarhumanmotion.net/hisc4d available for research purposes.\n","authors":["Yudi Dai","Zhiyong Wang","Xiping Lin","Chenglu Wen","Lan Xu","Siqi Shen","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04398v2.pdf","comment":"17 pages, 10 figures, Jornal"},{"id":"http://arxiv.org/abs/2309.11500v4","updated":"2024-09-09T14:52:15Z","published":"2023-09-20T17:59:32Z","title":"Auto-ACD: A Large-scale Dataset for Audio-Language Representation\n Learning","summary":" Recently, the AI community has made significant strides in developing\npowerful foundation models, driven by large-scale multimodal datasets. However,\nfor audio representation learning, existing datasets suffer from limitations in\nthe following aspects: insufficient volume, simplistic content, and arduous\ncollection procedures. To establish an audio dataset with high-quality\ncaptions, we propose an innovative, automatic approach leveraging multimodal\ninputs, such as video frames, audio streams. Specifically, we construct a\nlarge-scale, high-quality, audio-language dataset, named as Auto-ACD,\ncomprising over 1.5M audio-text pairs. We exploit a series of pre-trained\nmodels or APIs, to determine audio-visual synchronisation, generate image\ncaptions, object detection, or audio tags for specific videos. Subsequently, we\nemploy LLM to paraphrase a congruent caption for each audio, guided by the\nextracted multi-modality clues. To demonstrate the effectiveness of the\nproposed dataset, we train widely used models on our dataset and show\nperformance improvement on various downstream tasks, for example,\naudio-language retrieval, audio captioning, zero-shot classification. 
In\naddition, we establish a novel benchmark with environmental information and\nprovide a benchmark for audio-text tasks.\n","authors":["Luoyi Sun","Xuenan Xu","Mengyue Wu","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2309.11500v4.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2409.05659v1","updated":"2024-09-09T14:29:22Z","published":"2024-09-09T14:29:22Z","title":"Audio-Visual Speaker Diarization: Current Databases, Approaches and\n Challenges","summary":" Nowadays, the large amount of audio-visual content available has fostered the\nneed to develop new robust automatic speaker diarization systems to analyse and\ncharacterise it. This kind of system helps to reduce the cost of doing this\nprocess manually and allows the use of the speaker information for different\napplications, as a huge quantity of information is present, for example, images\nof faces, or audio recordings. Therefore, this paper aims to address a critical\narea in the field of speaker diarization systems, the integration of\naudio-visual content of different domains. This paper seeks to push beyond\ncurrent state-of-the-art practices by developing a robust audio-visual speaker\ndiarization framework adaptable to various data domains, including TV\nscenarios, meetings, and daily activities. Unlike most of the existing\naudio-visual speaker diarization systems, this framework will also include the\nproposal of an approach to lead the precise assignment of specific identities\nin TV scenarios where celebrities appear. In addition, in this work, we have\nconducted an extensive compilation of the current state-of-the-art approaches\nand the existing databases for developing audio-visual speaker diarization.\n","authors":["Victoria Mingote","Alfonso Ortega","Antonio Miguel","Eduardo Lleida"],"pdf_url":"https://arxiv.org/pdf/2409.05659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03632v3","updated":"2024-09-09T12:26:04Z","published":"2024-08-07T08:43:58Z","title":"Concept Conductor: Orchestrating Multiple Personalized Concepts in\n Text-to-Image Synthesis","summary":" The customization of text-to-image models has seen significant advancements,\nyet generating multiple personalized concepts remains a challenging task.\nCurrent methods struggle with attribute leakage and layout confusion when\nhandling multiple concepts, leading to reduced concept fidelity and semantic\nconsistency. In this work, we introduce a novel training-free framework,\nConcept Conductor, designed to ensure visual fidelity and correct layout in\nmulti-concept customization. Concept Conductor isolates the sampling processes\nof multiple custom models to prevent attribute leakage between different\nconcepts and corrects erroneous layouts through self-attention-based spatial\nguidance. Additionally, we present a concept injection technique that employs\nshape-aware masks to specify the generation area for each concept. This\ntechnique injects the structure and appearance of personalized concepts through\nfeature fusion in the attention layers, ensuring harmony in the final image.\nExtensive qualitative and quantitative experiments demonstrate that Concept\nConductor can consistently generate composite images with accurate layouts\nwhile preserving the visual details of each concept. Compared to existing\nbaselines, Concept Conductor shows significant performance improvements. Our\nmethod supports the combination of any number of concepts and maintains high\nfidelity even when dealing with visually similar concepts. 
The code and models\nare available at https://github.com/Nihukat/Concept-Conductor.\n","authors":["Zebin Yao","Fangxiang Feng","Ruifan Li","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03632v3.pdf","comment":"Github Page: https://github.com/Nihukat/Concept-Conductor"},{"id":"http://arxiv.org/abs/2409.05540v1","updated":"2024-09-09T12:00:17Z","published":"2024-09-09T12:00:17Z","title":"Exploring Rich Subjective Quality Information for Image Quality\n Assessment in the Wild","summary":" Traditional in the wild image quality assessment (IQA) models are generally\ntrained with the quality labels of mean opinion score (MOS), while missing the\nrich subjective quality information contained in the quality ratings, for\nexample, the standard deviation of opinion scores (SOS) or even distribution of\nopinion scores (DOS). In this paper, we propose a novel IQA method named\nRichIQA to explore the rich subjective rating information beyond MOS to predict\nimage quality in the wild. RichIQA is characterized by two key novel designs:\n(1) a three-stage image quality prediction network which exploits the powerful\nfeature representation capability of the Convolutional vision Transformer (CvT)\nand mimics the short-term and long-term memory mechanisms of human brain; (2) a\nmulti-label training strategy in which rich subjective quality information like\nMOS, SOS and DOS are concurrently used to train the quality prediction network.\nPowered by these two novel designs, RichIQA is able to predict the image\nquality in terms of a distribution, from which the mean image quality can be\nsubsequently obtained. Extensive experimental results verify that the\nthree-stage network is tailored to predict rich quality information, while the\nmulti-label training strategy can fully exploit the potentials within\nsubjective quality rating and enhance the prediction performance and\ngeneralizability of the network. RichIQA outperforms state-of-the-art\ncompetitors on multiple large-scale in the wild IQA databases with rich\nsubjective rating labels. The code of RichIQA will be made publicly available\non GitHub.\n","authors":["Xiongkuo Min","Yixuan Gao","Yuqin Cao","Guangtao Zhai","Wenjun Zhang","Huifang Sun","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05496v1","updated":"2024-09-09T10:48:33Z","published":"2024-09-09T10:48:33Z","title":"Educational Virtual Field Trips based on Social VR and 360° Spaces","summary":" Virtual field trips (VFTs) have proven to be valuable learning tools. Such\napplications are mostly based on 360{\\deg} technology and are to be\ncharacterized as single-user applications in technological terms. In contrast,\nSocial VR applications are characterized by multi-user capability and\nuser-specific avatars. From a learning perspective, the concepts of\ncollaborative learning and embodiment have long been proposed as conducive to\nlearning. Both concepts might be supported using Social VR. However, little is\ncurrently known about the use of Social VR for VFTs. Accordingly, the research\nquestions are to what extent VFTs can be implemented in Social VR environments\nand how these Social VR-based VFTs are perceived by learners. This article\npresents an evaluation study on the development and evaluation of a VFT\nenvironment using the Social VR platform Mozilla Hubs. 
It describes the design\ndecisions to create the environment and evaluation results from a mixed-method\nstudy (N=16) using a questionnaire and focus group discussions. The study\nhighlighted the opportunities offered by Social VR-based VFTs but also revealed\nseveral challenges that need to be addressed to embrace the potential of Social\nVR-based VFTs to be utilized regularly in education.\n","authors":["Surya Kalvakolu","Heinrich Söbke","Jannicke Baalsrud Hauge","Eckhard Kraft"],"pdf_url":"https://arxiv.org/pdf/2409.05496v1.pdf","comment":"9 pages, 7 figures, 1 table, submitted to Games and Learning Alliance\n Conference"},{"id":"http://arxiv.org/abs/2409.05384v1","updated":"2024-09-09T07:32:18Z","published":"2024-09-09T07:32:18Z","title":"Look One and More: Distilling Hybrid Order Relational Knowledge for\n Cross-Resolution Image Recognition","summary":" In spite of great success in many image recognition tasks achieved by recent\ndeep models, directly applying them to recognize low-resolution images may\nsuffer from low accuracy due to the missing of informative details during\nresolution degradation. However, these images are still recognizable for\nsubjects who are familiar with the corresponding high-resolution ones. Inspired\nby that, we propose a teacher-student learning approach to facilitate\nlow-resolution image recognition via hybrid order relational knowledge\ndistillation. The approach refers to three streams: the teacher stream is\npretrained to recognize high-resolution images in high accuracy, the student\nstream is learned to identify low-resolution images by mimicking the teacher's\nbehaviors, and the extra assistant stream is introduced as bridge to help\nknowledge transfer across the teacher to the student. To extract sufficient\nknowledge for reducing the loss in accuracy, the learning of student is\nsupervised with multiple losses, which preserves the similarities in various\norder relational structures. In this way, the capability of recovering missing\ndetails of familiar low-resolution images can be effectively enhanced, leading\nto a better knowledge transfer. Extensive experiments on metric learning,\nlow-resolution image classification and low-resolution face recognition tasks\nshow the effectiveness of our approach, while taking reduced models.\n","authors":["Shiming Ge","Kangkai Zhang","Haolin Liu","Yingying Hua","Shengwei Zhao","Xin Jin","Hao Wen"],"pdf_url":"https://arxiv.org/pdf/2409.05384v1.pdf","comment":"Accepted by AAAI 2020"},{"id":"http://arxiv.org/abs/2409.05330v1","updated":"2024-09-09T05:20:02Z","published":"2024-09-09T05:20:02Z","title":"KAN-Based Fusion of Dual-Domain for Audio-Driven Facial Landmarks\n Generation","summary":" Audio-driven talking face generation is a widely researched topic due to its\nhigh applicability. Reconstructing a talking face using audio significantly\ncontributes to fields such as education, healthcare, online conversations,\nvirtual assistants, and virtual reality. Early studies often focused solely on\nchanging the mouth movements, which resulted in outcomes with limited practical\napplications. Recently, researchers have proposed a new approach of\nconstructing the entire face, including face pose, neck, and shoulders. To\nachieve this, they need to generate through landmarks. However, creating stable\nlandmarks that align well with the audio is a challenge. In this paper, we\npropose the KFusion of Dual-Domain model, a robust model that generates\nlandmarks from audio. 
We separate the audio into two distinct domains to learn\nemotional information and facial context, then use a fusion mechanism based on\nthe KAN model. Our model demonstrates high efficiency compared to recent\nmodels. This will lay the groundwork for the development of the audio-driven\ntalking face generation problem in the future.\n","authors":["Hoang-Son Vo-Thanh","Quang-Vinh Nguyen","Soo-Hyung Kim"],"pdf_url":"https://arxiv.org/pdf/2409.05330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05297v1","updated":"2024-09-09T03:10:40Z","published":"2024-09-09T03:10:40Z","title":"Adaptive Offloading and Enhancement for Low-Light Video Analytics on\n Mobile Devices","summary":" In this paper, we explore adaptive offloading and enhancement strategies for\nvideo analytics tasks on computing-constrained mobile devices in low-light\nconditions. We observe that the accuracy of low-light video analytics varies\nfrom different enhancement algorithms. The root cause could be the disparities\nin the effectiveness of enhancement algorithms for feature extraction in\nanalytic models. Specifically, the difference in class activation maps (CAMs)\nbetween enhanced and low-light frames demonstrates a positive correlation with\nvideo analytics accuracy. Motivated by such observations, a novel enhancement\nquality assessment method is proposed on CAMs to evaluate the effectiveness of\ndifferent enhancement algorithms for low-light videos. Then, we design a\nmulti-edge system, which adaptively offloads and enhances low-light video\nanalytics tasks from mobile devices. To achieve the trade-off between the\nenhancement quality and the latency for all system-served mobile devices, we\npropose a genetic-based scheduling algorithm, which can find a near-optimal\nsolution in a reasonable time to meet the latency requirement. Thereby, the\noffloading strategies and the enhancement algorithms are properly selected\nunder the condition of limited end-edge bandwidth and edge computation\nresources. Simulation experiments demonstrate the superiority of the proposed\nsystem, improving accuracy up to 20.83\\% compared to existing benchmarks.\n","authors":["Yuanyi He","Peng Yang","Tian Qin","Jiawei Hou","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14066v3","updated":"2024-09-09T03:06:21Z","published":"2024-07-19T06:50:24Z","title":"360VFI: A Dataset and Benchmark for Omnidirectional Video Frame\n Interpolation","summary":" Head-mounted 360{\\deg} displays and portable 360{\\deg} cameras have\nsignificantly progressed, providing viewers a realistic and immersive\nexperience. However, many omnidirectional videos have low frame rates that can\nlead to visual fatigue, and the prevailing plane frame interpolation\nmethodologies are unsuitable for omnidirectional video interpolation because\nthey are designed solely for traditional videos. This paper introduces the\nbenchmark dataset, 360VFI, for Omnidirectional Video Frame Interpolation. We\npresent a practical implementation that introduces a distortion prior from\nomnidirectional video into the network to modulate distortions. Specifically,\nwe propose a pyramid distortion-sensitive feature extractor that uses the\nunique characteristics of equirectangular projection (ERP) format as prior\ninformation. Moreover, we devise a decoder that uses an affine transformation\nto further facilitate the synthesis of intermediate frames. 
360VFI is the first\ndataset and benchmark that explores the challenge of Omnidirectional Video\nFrame Interpolation. Through our benchmark analysis, we present four different\ndistortion condition scenes in the proposed 360VFI dataset to evaluate the\nchallenges triggered by distortion during interpolation. Besides, experimental\nresults demonstrate that Omnidirectional Video Interpolation can be effectively\nimproved by modeling for omnidirectional distortion.\n","authors":["Wenxuan Lu","Mengshun Hu","Yansheng Qiu","Liang Liao","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14066v3.pdf","comment":"This is a preprint version"}]},"2024-09-08T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.19612v2","updated":"2024-09-08T20:32:47Z","published":"2024-05-30T02:00:03Z","title":"Keyword-driven Retrieval-Augmented Large Language Models for Cold-start\n User Recommendations","summary":" Recent advancements in Large Language Models (LLMs) have shown significant\npotential in enhancing recommender systems. However, addressing the cold-start\nrecommendation problem, where users lack historical data, remains a\nconsiderable challenge. In this paper, we introduce KALM4Rec (Keyword-driven\nRetrieval-Augmented Large Language Models for Cold-start User Recommendations),\na novel framework specifically designed to tackle this problem by requiring\nonly a few input keywords from users in a practical scenario of cold-start user\nrestaurant recommendations. KALM4Rec operates in two main stages: candidates\nretrieval and LLM-based candidates re-ranking. In the first stage,\nkeyword-driven retrieval models are used to identify potential candidates,\naddressing LLMs' limitations in processing extensive tokens and reducing the\nrisk of generating misleading information. In the second stage, we employ LLMs\nwith various prompting strategies, including zero-shot and few-shot techniques,\nto re-rank these candidates by integrating multiple examples directly into the\nLLM prompts. Our evaluation, using a Yelp restaurant dataset with user reviews\nfrom three English-speaking cities, shows that our proposed framework\nsignificantly improves recommendation quality. Specifically, the integration of\nin-context instructions with LLMs for re-ranking markedly enhances the\nperformance of the cold-start user recommender system.\n","authors":["Hai-Dang Kieu","Minh Duc Nguyen","Thanh-Son Nguyen","Dung D. Le"],"pdf_url":"https://arxiv.org/pdf/2405.19612v2.pdf","comment":"10 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.05152v1","updated":"2024-09-08T16:35:19Z","published":"2024-09-08T16:35:19Z","title":"OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs","summary":" Despite the recent advancements in Large Language Models (LLMs), which have\nsignificantly enhanced the generative capabilities for various NLP tasks, LLMs\nstill face limitations in directly handling retrieval tasks. However, many\npractical applications demand the seamless integration of both retrieval and\ngeneration. This paper introduces a novel and efficient One-pass Generation and\nretrieval framework (OneGen), designed to improve LLMs' performance on tasks\nthat require both generation and retrieval. The proposed framework bridges the\ntraditionally separate training approaches for generation and retrieval by\nincorporating retrieval tokens generated autoregressively. This enables a\nsingle LLM to handle both tasks simultaneously in a unified forward pass. 
We\nconduct experiments on two distinct types of composite tasks, RAG and Entity\nLinking, to validate the pluggability, effectiveness, and efficiency of OneGen\nin training and inference. Furthermore, our results show that integrating\ngeneration and retrieval within the same context preserves the generative\ncapabilities of LLMs while improving retrieval performance. To the best of our\nknowledge, OneGen is the first to enable LLMs to conduct vector retrieval\nduring the generation.\n","authors":["Jintian Zhang","Cheng Peng","Mengshu Sun","Xiang Chen","Lei Liang","Zhiqiang Zhang","Jun Zhou","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05152v1.pdf","comment":"Work in progress; code is available at\n https://github.com/zjunlp/OneGen"},{"id":"http://arxiv.org/abs/2409.05033v1","updated":"2024-09-08T08:57:12Z","published":"2024-09-08T08:57:12Z","title":"A Survey on Diffusion Models for Recommender Systems","summary":" While traditional recommendation techniques have made significant strides in\nthe past decades, they still suffer from limited generalization performance\ncaused by factors like inadequate collaborative signals, weak latent\nrepresentations, and noisy data. In response, diffusion models (DMs) have\nemerged as promising solutions for recommender systems due to their robust\ngenerative capabilities, solid theoretical foundations, and improved training\nstability. To this end, in this paper, we present the first comprehensive\nsurvey on diffusion models for recommendation, and draw a bird's-eye view from\nthe perspective of the whole pipeline in real-world recommender systems. We\nsystematically categorize existing research works into three primary domains:\n(1) diffusion for data engineering & encoding, focusing on data augmentation\nand representation enhancement; (2) diffusion as recommender models, employing\ndiffusion models to directly estimate user preferences and rank items; and (3)\ndiffusion for content presentation, utilizing diffusion models to generate\npersonalized content such as fashion and advertisement creatives. Our taxonomy\nhighlights the unique strengths of diffusion models in capturing complex data\ndistributions and generating high-quality, diverse samples that closely align\nwith user preferences. We also summarize the core characteristics of the\nadapting diffusion models for recommendation, and further identify key areas\nfor future exploration, which helps establish a roadmap for researchers and\npractitioners seeking to advance recommender systems through the innovative\napplication of diffusion models. To further facilitate the research community\nof recommender systems based on diffusion models, we actively maintain a GitHub\nrepository for papers and other related resources in this rising direction\nhttps://github.com/CHIANGEL/Awesome-Diffusion-for-RecSys.\n","authors":["Jianghao Lin","Jiaqi Liu","Jiachen Zhu","Yunjia Xi","Chengkai Liu","Yangtian Zhang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05033v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.05022v1","updated":"2024-09-08T08:27:22Z","published":"2024-09-08T08:27:22Z","title":"Sequential Recommendation via Adaptive Robust Attention with\n Multi-dimensional Embeddings","summary":" Sequential recommendation models have achieved state-of-the-art performance\nusing self-attention mechanism. 
It has since been found that moving beyond only\nusing item ID and positional embeddings leads to a significant accuracy boost\nwhen predicting the next item. In recent literature, it was reported that a\nmulti-dimensional kernel embedding with temporal contextual kernels to capture\nusers' diverse behavioral patterns results in a substantial performance\nimprovement. In this study, we further improve the sequential recommender\nmodel's robustness and generalization by introducing a mix-attention mechanism\nwith a layer-wise noise injection (LNI) regularization. We refer to our\nproposed model as adaptive robust sequential recommendation framework (ADRRec),\nand demonstrate through extensive experiments that our model outperforms\nexisting self-attention architectures.\n","authors":["Linsey Pang","Amir Hossein Raffiee","Wei Liu","Keld Lundgaard"],"pdf_url":"https://arxiv.org/pdf/2409.05022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11119v2","updated":"2024-09-08T04:25:32Z","published":"2024-04-17T07:07:41Z","title":"DREAM: A Dual Representation Learning Model for Multimodal\n Recommendation","summary":" Multimodal recommendation focuses primarily on effectively exploiting both\nbehavioral and multimodal information for the recommendation task. However,\nmost existing models suffer from the following issues when fusing information\nfrom two different domains: (1) Previous works do not pay attention to the\nsufficient utilization of modal information by only using direct concatenation,\naddition, or simple linear layers for modal information extraction. (2)\nPrevious works treat modal features as learnable embeddings, which causes the\nmodal embeddings to gradually deviate from the original modal features during\nlearning. We refer to this issue as Modal Information Forgetting. (3) Previous\napproaches fail to account for the significant differences in the distribution\nbetween behavior and modality, leading to the issue of representation\nmisalignment. To address these challenges, this paper proposes a novel Dual\nREpresentAtion learning model for Multimodal Recommendation called DREAM. For\nsufficient information extraction, we introduce separate dual lines, including\nBehavior Line and Modal Line, in which the Modal-specific Encoder is applied to\nempower modal representations. To address the issue of Modal Information\nForgetting, we introduce the Similarity Supervised Signal to constrain the\nmodal representations. Additionally, we design a Behavior-Modal Alignment\nmodule to fuse the dual representations through Intra-Alignment and\nInter-Alignment. Extensive experiments on three public datasets demonstrate\nthat the proposed DREAM method achieves state-of-the-art (SOTA) results. The\nsource code will be available upon acceptance.\n","authors":["Kangning Zhang","Yingjie Qin","Jiarui Jin","Yifan Liu","Ruilong Su","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11119v2.pdf","comment":"10 pages, 11 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.03336v2","updated":"2024-09-08T14:21:13Z","published":"2024-09-05T08:28:36Z","title":"Estimating Indoor Scene Depth Maps from Ultrasonic Echoes","summary":" Measuring 3D geometric structures of indoor scenes requires dedicated depth\nsensors, which are not always available. Echo-based depth estimation has\nrecently been studied as a promising alternative solution. All previous studies\nhave assumed the use of echoes in the audible range. 
However, one major problem\nis that audible echoes cannot be used in quiet spaces or other situations where\nproducing audible sounds is prohibited. In this paper, we consider echo-based\ndepth estimation using inaudible ultrasonic echoes. While ultrasonic waves\nprovide high measurement accuracy in theory, the actual depth estimation\naccuracy when ultrasonic echoes are used has remained unclear, due to its\ndisadvantage of being sensitive to noise and susceptible to attenuation. We\nfirst investigate the depth estimation accuracy when the frequency of the sound\nsource is restricted to the high-frequency band, and found that the accuracy\ndecreased when the frequency was limited to ultrasonic ranges. Based on this\nobservation, we propose a novel deep learning method to improve the accuracy of\nultrasonic echo-based depth estimation by using audible echoes as auxiliary\ndata only during training. Experimental results with a public dataset\ndemonstrate that our method improves the estimation accuracy.\n","authors":["Junpei Honma","Akisato Kimura","Go Irie"],"pdf_url":"https://arxiv.org/pdf/2409.03336v2.pdf","comment":"ICIP 2024"},{"id":"http://arxiv.org/abs/2404.13306v2","updated":"2024-09-08T12:07:52Z","published":"2024-04-20T07:28:55Z","title":"FakeBench: Probing Explainable Fake Image Detection via Large Multimodal\n Models","summary":" The ability to distinguish whether an image is generated by artificial\nintelligence (AI) is a crucial ingredient in human intelligence, usually\naccompanied by a complex and dialectical forensic and reasoning process.\nHowever, current fake image detection models and databases focus on binary\nclassification without understandable explanations for the general populace.\nThis weakens the credibility of authenticity judgment and may conceal potential\nmodel biases. Meanwhile, large multimodal models (LMMs) have exhibited immense\nvisual-text capabilities on various tasks, bringing the potential for\nexplainable fake image detection. Therefore, we pioneer the probe of LMMs for\nexplainable fake image detection by presenting a multimodal database\nencompassing textual authenticity descriptions, the FakeBench. For\nconstruction, we first introduce a fine-grained taxonomy of generative visual\nforgery concerning human perception, based on which we collect forgery\ndescriptions in human natural language with a human-in-the-loop strategy.\nFakeBench examines LMMs with four evaluation criteria: detection, reasoning,\ninterpretation and fine-grained forgery analysis, to obtain deeper insights\ninto image authenticity-relevant capabilities. Experiments on various LMMs\nconfirm their merits and demerits in different aspects of fake image detection\ntasks. This research presents a paradigm shift towards transparency for the\nfake image detection area and reveals the need for greater emphasis on forensic\nelements in visual-language research and AI risk control. FakeBench will be\navailable at https://github.com/Yixuan423/FakeBench.\n","authors":["Yixuan Li","Xuelin Liu","Xiaoyang Wang","Bu Sung Lee","Shiqi Wang","Anderson Rocha","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2404.13306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04999v1","updated":"2024-09-08T07:08:58Z","published":"2024-09-08T07:08:58Z","title":"Visual Grounding with Multi-modal Conditional Adaptation","summary":" Visual grounding is the task of locating objects specified by natural\nlanguage expressions. Existing methods extend generic object detection\nframeworks to tackle this task. 
They typically extract visual and textual\nfeatures separately using independent visual and textual encoders, then fuse\nthese features in a multi-modal decoder for final prediction. However, visual\ngrounding presents unique challenges. It often involves locating objects with\ndifferent text descriptions within the same image. Existing methods struggle\nwith this task because the independent visual encoder produces identical visual\nfeatures for the same image, limiting detection performance. Some recently\napproaches propose various language-guided visual encoders to address this\nissue, but they mostly rely solely on textual information and require\nsophisticated designs. In this paper, we introduce Multi-modal Conditional\nAdaptation (MMCA), which enables the visual encoder to adaptively update\nweights, directing its focus towards text-relevant regions. Specifically, we\nfirst integrate information from different modalities to obtain multi-modal\nembeddings. Then we utilize a set of weighting coefficients, which generated\nfrom the multimodal embeddings, to reorganize the weight update matrices and\napply them to the visual encoder of the visual grounding model. Extensive\nexperiments on four widely used datasets demonstrate that MMCA achieves\nsignificant improvements and state-of-the-art results. Ablation experiments\nfurther demonstrate the lightweight and efficiency of our method. Our source\ncode is available at: https://github.com/Mr-Bigworth/MMCA.\n","authors":["Ruilin Yao","Shengwu Xiong","Yichen Zhao","Yi Rong"],"pdf_url":"https://arxiv.org/pdf/2409.04999v1.pdf","comment":"Accepted by ACM MM 2024 [Oral]"},{"id":"http://arxiv.org/abs/2404.11119v2","updated":"2024-09-08T04:25:32Z","published":"2024-04-17T07:07:41Z","title":"DREAM: A Dual Representation Learning Model for Multimodal\n Recommendation","summary":" Multimodal recommendation focuses primarily on effectively exploiting both\nbehavioral and multimodal information for the recommendation task. However,\nmost existing models suffer from the following issues when fusing information\nfrom two different domains: (1) Previous works do not pay attention to the\nsufficient utilization of modal information by only using direct concatenation,\naddition, or simple linear layers for modal information extraction. (2)\nPrevious works treat modal features as learnable embeddings, which causes the\nmodal embeddings to gradually deviate from the original modal features during\nlearning. We refer to this issue as Modal Information Forgetting. (3) Previous\napproaches fail to account for the significant differences in the distribution\nbetween behavior and modality, leading to the issue of representation\nmisalignment. To address these challenges, this paper proposes a novel Dual\nREpresentAtion learning model for Multimodal Recommendation called DREAM. For\nsufficient information extraction, we introduce separate dual lines, including\nBehavior Line and Modal Line, in which the Modal-specific Encoder is applied to\nempower modal representations. To address the issue of Modal Information\nForgetting, we introduce the Similarity Supervised Signal to constrain the\nmodal representations. Additionally, we design a Behavior-Modal Alignment\nmodule to fuse the dual representations through Intra-Alignment and\nInter-Alignment. Extensive experiments on three public datasets demonstrate\nthat the proposed DREAM method achieves state-of-the-art (SOTA) results. 
The\nsource code will be available upon acceptance.\n","authors":["Kangning Zhang","Yingjie Qin","Jiarui Jin","Yifan Liu","Ruilong Su","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11119v2.pdf","comment":"10 pages, 11 figures"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: 
var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + 
border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..fcc82156 --- /dev/null +++ b/index.html @@ -0,0 +1,22601 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 57 + +
+
+
+ + ☆ Agents in Software Engineering: Survey, Landscape, and Vision + + +
+ In recent years, Large Language Models (LLMs) have achieved remarkable +success and have been widely used in various downstream tasks, especially in +the tasks of the software engineering (SE) field. We find that many studies +combining LLMs with SE have employed the concept of agents either explicitly or +implicitly. However, there is a lack of an in-depth survey to sort out the +development context of existing works, analyze how existing works combine the +LLM-based agent technologies to optimize various tasks, and clarify the +framework of LLM-based agents in SE. In this paper, we conduct the first survey +of the studies on combining LLM-based agents with SE and present a framework of +LLM-based agents in SE which includes three key modules: perception, memory, +and action. We also summarize the current challenges in combining the two +fields and propose future opportunities in response to existing challenges. We +maintain a GitHub repository of the related papers at: +https://github.com/DeepSoftwareAnalytics/Awesome-Agent4SE. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ AI-LieDar: Examine the Trade-off Between Utility and Truthfulness in LLM + Agents + + +
+ To be safely and successfully deployed, LLMs must simultaneously satisfy +truthfulness and utility goals. Yet, often these two goals compete (e.g., an AI +agent assisting a used car salesman selling a car with flaws), partly due to +ambiguous or misleading user instructions. We propose AI-LieDar, a framework to +study how LLM-based agents navigate scenarios with utility-truthfulness +conflicts in a multi-turn interactive setting. We design a set of realistic +scenarios where language agents are instructed to achieve goals that are in +conflict with being truthful during a multi-turn conversation with simulated +human agents. To evaluate the truthfulness at large scale, we develop a +truthfulness detector inspired by psychological literature to assess the +agents' responses. Our experiment demonstrates that all models are truthful +less than 50% of the time, although truthfulness and goal achievement (utility) +rates vary across models. We further test the steerability of LLMs towards +truthfulness, finding that models follow malicious instructions to deceive, and +even truth-steered models can still lie. These findings reveal the complex +nature of truthfulness in LLMs and underscore the importance of further +research to ensure the safe and reliable deployment of LLMs and AI agents. + +
+
+
+
+
+ + ☆ Optimizing Rare Word Accuracy in Direct Speech Translation with a + Retrieval-and-Demonstration Approach + + +
+ Direct speech translation (ST) models often struggle with rare words. +Incorrect translation of these words can have severe consequences, impacting +translation quality and user trust. While rare word translation is inherently +challenging for neural models due to sparse learning signals, real-world +scenarios often allow access to translations of past recordings on similar +topics. To leverage these valuable resources, we propose a +retrieval-and-demonstration approach to enhance rare word translation accuracy +in direct ST models. First, we adapt existing ST models to incorporate +retrieved examples for rare word translation, which allows the model to benefit +from prepended examples, similar to in-context learning. We then develop a +cross-modal (speech-to-speech, speech-to-text, text-to-text) retriever to +locate suitable examples. We demonstrate that standard ST models can be +effectively adapted to leverage examples for rare word translation, improving +rare word translation accuracy over the baseline by 17.6% with gold examples +and 8.5% with retrieved examples. Moreover, our speech-to-speech retrieval +approach outperforms other modalities and exhibits higher robustness to unseen +speakers. Our code is publicly available +(https://github.com/SiqiLii/Retrieve-and-Demonstration-ST). + +
+
+
+
+
+ + ☆ E2MoCase: A Dataset for Emotional, Event and Moral Observations in News + Articles on High-impact Legal Cases + + +
+ The way media reports on legal cases can significantly shape public opinion, +often embedding subtle biases that influence societal views on justice and +morality. Analyzing these biases requires a holistic approach that captures the +emotional tone, moral framing, and specific events within the narratives. In +this work we introduce E2MoCase, a novel dataset designed to facilitate the +integrated analysis of emotions, moral values, and events within legal +narratives and media coverage. By leveraging advanced models for emotion +detection, moral value identification, and event extraction, E2MoCase offers a +multi-dimensional perspective on how legal cases are portrayed in news +articles. + +
+
+
+
+
+ + ☆ Safeguarding Decentralized Social Media: LLM Agents for Automating + Community Rule Compliance + + +
+ Ensuring content compliance with community guidelines is crucial for +maintaining healthy online social environments. However, traditional +human-based compliance checking struggles with scaling due to the increasing +volume of user-generated content and a limited number of moderators. Recent +advancements in Natural Language Understanding demonstrated by Large Language +Models unlock new opportunities for automated content compliance verification. +This work evaluates six AI-agents built on Open-LLMs for automated rule +compliance checking in Decentralized Social Networks, a challenging environment +due to heterogeneous community scopes and rules. Analyzing over 50,000 posts +from hundreds of Mastodon servers, we find that AI-agents effectively detect +non-compliant content, grasp linguistic subtleties, and adapt to diverse +community contexts. Most agents also show high inter-rater reliability and +consistency in score justification and suggestions for compliance. Human-based +evaluation with domain experts confirmed the agents' reliability and +usefulness, rendering them promising tools for semi-automated or +human-in-the-loop content moderation systems. + +
+
+
+
+
+ + ☆ SynSUM -- Synthetic Benchmark with Structured and Unstructured Medical + Records + + +
+ We present the SynSUM benchmark, a synthetic dataset linking unstructured +clinical notes to structured background variables. The dataset consists of +10,000 artificial patient records containing tabular variables (like symptoms, +diagnoses and underlying conditions) and related notes describing the fictional +patient encounter in the domain of respiratory diseases. The tabular portion of +the data is generated through a Bayesian network, where both the causal +structure between the variables and the conditional probabilities are proposed +by an expert based on domain knowledge. We then prompt a large language model +(GPT-4o) to generate a clinical note related to this patient encounter, +describing the patient symptoms and additional context. The SynSUM dataset is +primarily designed to facilitate research on clinical information extraction in +the presence of tabular background variables, which can be linked through +domain knowledge to concepts of interest to be extracted from the text - the +symptoms, in the case of SynSUM. Secondary uses include research on the +automation of clinical reasoning over both tabular data and text, causal effect +estimation in the presence of tabular and/or textual confounders, and +multi-modal synthetic data generation. The dataset can be downloaded from +https://github.com/prabaey/SynSUM. + +
+
+
+
+
+ + ☆ Affective Computing Has Changed: The Foundation Model Disruption + + +
+ The dawn of Foundation Models has on the one hand revolutionised a wide range +of research problems, and, on the other hand, democratised the access and use +of AI-based tools by the general public. We even observe an incursion of these +models into disciplines related to human psychology, such as the Affective +Computing domain, suggesting their affective, emerging capabilities. In this +work, we aim to raise awareness of the power of Foundation Models in the field +of Affective Computing by synthetically generating and analysing multimodal +affective data, focusing on vision, linguistics, and speech (acoustics). We +also discuss some fundamental problems, such as ethical issues and regulatory +aspects, related to the use of Foundation Models in this research area. + +
+
+
+
+
+ + ☆ Visual Language Tracking with Multi-modal Interaction: A Robust + Benchmark + + +
+ Visual Language Tracking (VLT) enhances tracking by mitigating the +limitations of relying solely on the visual modality, utilizing high-level +semantic information through language. This integration of the language enables +more advanced human-machine interaction. The essence of interaction is +cognitive alignment, which typically requires multiple information exchanges, +especially in the sequential decision-making process of VLT. However, current +VLT benchmarks do not account for multi-round interactions during tracking. +They provide only an initial text and bounding box (bbox) in the first frame, +with no further interaction as tracking progresses, deviating from the original +motivation of the VLT task. To address these limitations, we propose a novel +and robust benchmark, VLT-MI (Visual Language Tracking with Multi-modal +Interaction), which introduces multi-round interaction into the VLT task for +the first time. (1) We generate diverse, multi-granularity texts for +multi-round, multi-modal interaction based on existing mainstream VLT +benchmarks using DTLLM-VLT, leveraging the world knowledge of LLMs. (2) We +propose a new VLT interaction paradigm that achieves multi-round interaction +through text updates and object recovery. When multiple tracking failures +occur, we provide the tracker with more aligned texts and corrected bboxes +through interaction, thereby expanding the scope of VLT downstream tasks. (3) +We conduct comparative experiments on both traditional VLT benchmarks and +VLT-MI, evaluating and analyzing the accuracy and robustness of trackers under +the interactive paradigm. This work offers new insights and paradigms for the +VLT task, enabling a fine-grained evaluation of multi-modal trackers. We +believe this approach can be extended to additional datasets in the future, +supporting broader evaluations and comparisons of video-language model +capabilities. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Exploring the Impact of Data Quantity on ASR in Extremely Low-resource + Languages + + +
+ This study investigates the efficacy of data augmentation techniques for +low-resource automatic speech recognition (ASR), focusing on two endangered +Austronesian languages, Amis and Seediq. Recognizing the potential of +self-supervised learning (SSL) in low-resource settings, we explore the impact +of data volume on the continued pre-training of SSL models. We propose a novel +data-selection scheme leveraging a multilingual corpus to augment the limited +target language data. This scheme utilizes a language classifier to extract +utterance embeddings and employs one-class classifiers to identify utterances +phonetically and phonologically proximate to the target languages. Utterances +are ranked and selected based on their decision scores, ensuring the inclusion +of highly relevant data in the SSL-ASR pipeline. Our experimental results +demonstrate the effectiveness of this approach, yielding substantial +improvements in ASR performance for both Amis and Seediq. These findings +underscore the feasibility and promise of data augmentation through +cross-lingual transfer learning for low-resource language ASR. + +
+
+
+
+
+ + ☆ FP-VEC: Fingerprinting Large Language Models via Efficient Vector + Addition + + +
+ Training Large Language Models (LLMs) requires immense computational power +and vast amounts of data. As a result, protecting the intellectual property of +these models through fingerprinting is essential for ownership authentication. +While adding fingerprints to LLMs through fine-tuning has been attempted, it +remains costly and unscalable. In this paper, we introduce FP-VEC, a pilot +study on using fingerprint vectors as an efficient fingerprinting method for +LLMs. Our approach generates a fingerprint vector that represents a +confidential signature embedded in the model, allowing the same fingerprint to +be seamlessly incorporated into an unlimited number of LLMs via vector +addition. Results on several LLMs show that FP-VEC is lightweight by running on +CPU-only devices for fingerprinting, scalable with a single training and +unlimited fingerprinting process, and preserves the model's normal behavior. +The project page is available at https://fingerprintvector.github.io . + +
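The core operation described above, representing the fingerprint as a parameter delta and stamping models by vector addition, can be sketched in a few lines of state-dict arithmetic. This is only an illustration of the addition step; the procedure for obtaining the fingerprint delta is specific to the paper:

```python
import torch

# Sketch of the fingerprint-as-a-vector idea: the fingerprint is stored as a
# parameter delta and applied to any compatible checkpoint by addition on CPU.

def extract_fingerprint(base_sd: dict, fingerprinted_sd: dict) -> dict:
    """Fingerprint vector = fingerprinted weights minus base weights."""
    return {k: fingerprinted_sd[k] - base_sd[k] for k in base_sd}

def apply_fingerprint(target_sd: dict, fp_vec: dict, scale: float = 1.0) -> dict:
    """Stamp another model of the same architecture via element-wise addition."""
    return {k: target_sd[k] + scale * fp_vec[k] for k in target_sd}

# Toy weights standing in for LLM checkpoints with identical shapes.
base = {"w": torch.zeros(4, 4)}
fingerprinted = {"w": torch.full((4, 4), 0.01)}
other_model = {"w": torch.randn(4, 4)}

fp = extract_fingerprint(base, fingerprinted)
stamped = apply_fingerprint(other_model, fp)
assert torch.allclose(stamped["w"] - other_model["w"], fp["w"])
```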
+
+
+
+
+ + ☆ AIPO: Improving Training Objective for Iterative Preference Optimization + + +
+ Preference Optimization (PO) is gaining popularity as an alternative to Proximal Policy Optimization (PPO) for aligning Large Language Models (LLMs). Recent research on aligning LLMs iteratively with synthetic or partially synthetic data shows promising results in scaling up PO training for both academic settings and proprietary models such as Llama3. Despite its success, our study shows that the length exploitation issue present in PO is even more severe in Iterative Preference Optimization (IPO) due to the iterative nature of the process. In this work, we study iterative preference optimization with synthetic data. We share the findings and analysis gathered while building the iterative preference optimization pipeline. More specifically, we discuss the length exploitation issue during iterative preference optimization and propose our training objective for iterative preference optimization, namely Agreement-aware Iterative Preference Optimization (AIPO). To demonstrate the effectiveness of our method, we conduct comprehensive experiments and achieve state-of-the-art performance on MT-Bench, AlpacaEval 2.0, and Arena-Hard. Our implementation and model checkpoints will be made available at https://github.com/bytedance/AIPO. + +
<p></p>
+
+
+
+
+ + ☆ Your Weak LLM is Secretly a Strong Teacher for Alignment + + +
+ The burgeoning capabilities of large language models (LLMs) have underscored +the need for alignment to ensure these models act in accordance with human +values and intentions. Existing alignment frameworks present constraints either +in the form of expensive human effort or high computational costs. This paper +explores a promising middle ground, where we employ a weak LLM that is +significantly less resource-intensive than top-tier models, yet offers more +automation than purely human feedback. We present a systematic study to +evaluate and understand weak LLM's ability to generate feedback for alignment. +Our empirical findings demonstrate that weak LLMs can provide feedback that +rivals or even exceeds that of fully human-annotated data. Our study indicates +a minimized impact of model size on feedback efficacy, shedding light on a +scalable and sustainable alignment strategy. To deepen our understanding of +alignment under weak LLM feedback, we conduct a series of qualitative and +quantitative analyses, offering novel insights into the quality discrepancies +between human feedback vs. weak LLM feedback. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Exploring SSL Discrete Tokens for Multilingual ASR ICASSP 2025 + + +
+ With the advancement of Self-supervised Learning (SSL) in speech-related tasks, there has been growing interest in utilizing discrete tokens generated by SSL for automatic speech recognition (ASR), as they offer faster processing. However, previous studies primarily focused on multilingual ASR with Fbank features or English ASR with discrete tokens, leaving a gap in adapting discrete tokens for multilingual ASR scenarios. This study presents a comprehensive comparison of discrete tokens generated by various leading SSL models across multiple language domains. We aim to explore the performance and efficiency of speech discrete tokens across multiple language domains for both monolingual and multilingual ASR scenarios. Experimental results demonstrate that discrete tokens achieve results comparable to systems trained on Fbank features in ASR tasks across seven language domains, with average absolute word error rate (WER) reductions of 0.31% and 1.76% (2.80% and 15.70% relative) on the dev and test sets respectively, and a particularly large absolute WER reduction of 6.82% (41.48% relative) on the Polish test set. + +
<p></p>
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Exploring SSL Discrete Speech Features for Zipformer-based Contextual + ASR ICASSP 2025 + + +
+ Self-supervised learning (SSL) based discrete speech representations are highly compact and domain adaptable. In this paper, SSL discrete speech features extracted from WavLM models are used as additional cross-utterance acoustic context features in Zipformer-Transducer ASR systems. The efficacy of replacing Fbank features with discrete token features for modelling either cross-utterance contexts (from preceding and future segments), or the current utterance's internal contexts alone, or both at the same time, is demonstrated thoroughly on the Gigaspeech 1000-hr corpus. The best Zipformer-Transducer system using discrete-token-based cross-utterance context features outperforms the baseline using utterance-internal context only, with statistically significant word error rate (WER) reductions of 0.32% to 0.41% absolute (2.78% to 3.54% relative) on the dev and test data. The lowest published WERs of 11.15% and 11.14% were obtained on the dev and test sets. Our work is open-source and publicly available at https://github.com/open-creator/icefall/tree/master/egs/gigaspeech/Context_ASR. + +
<p></p>
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Optimizing Ingredient Substitution Using Large Language Models to + Enhance Phytochemical Content in Recipes + + +
+ In the emerging field of computational gastronomy, aligning culinary +practices with scientifically supported nutritional goals is increasingly +important. This study explores how large language models (LLMs) can be applied +to optimize ingredient substitutions in recipes, specifically to enhance the +phytochemical content of meals. Phytochemicals are bioactive compounds found in +plants, which, based on preclinical studies, may offer potential health +benefits. We fine-tuned models, including OpenAI's GPT-3.5, DaVinci, and Meta's +TinyLlama, using an ingredient substitution dataset. These models were used to +predict substitutions that enhance phytochemical content and create a +corresponding enriched recipe dataset. Our approach improved Hit@1 accuracy on +ingredient substitution tasks, from the baseline 34.53 plus-minus 0.10% to +38.03 plus-minus 0.28% on the original GISMo dataset, and from 40.24 plus-minus +0.36% to 54.46 plus-minus 0.29% on a refined version of the same dataset. These +substitutions led to the creation of 1,951 phytochemically enriched ingredient +pairings and 1,639 unique recipes. While this approach demonstrates potential +in optimizing ingredient substitutions, caution must be taken when drawing +conclusions about health benefits, as the claims are based on preclinical +evidence. Future work should include clinical validation and broader datasets +to further evaluate the nutritional impact of these substitutions. This +research represents a step forward in using AI to promote healthier eating +practices, providing potential pathways for integrating computational methods +with nutritional science. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Sign Language Sense Disambiguation + + +
+ This project explores methods to enhance sign language translation for German Sign Language, specifically focusing on the disambiguation of homonyms. Sign language is ambiguous and understudied, which motivates our experiments. We approach the improvement by training transformer-based models on various body-part representations to shift the focus to the given body part. To determine the impact of, e.g., the hand or mouth representations, we experiment with different combinations. The results show that focusing on the mouth increases performance in small-dataset settings, while shifting the focus to the hands yields better results in larger-dataset settings. Our results contribute to better accessibility for non-hearing persons by improving the systems powering digital assistants, enabling more accurate interaction. The code for this project can be found on GitHub. + +
<p></p>
+
+ comment: LIMO2024 @ KONVENS 2024, 8 pages, 3 figures +
+
+
+
+
+ + ☆ Journalists, Emotions, and the Introduction of Generative AI Chatbots: A + Large-Scale Analysis of Tweets Before and After the Launch of ChatGPT + + +
+ As part of a broader look at the impact of generative AI, this study +investigated the emotional responses of journalists to the release of ChatGPT +at the time of its launch. By analyzing nearly 1 million Tweets from +journalists at major U.S. news outlets, we tracked changes in emotional tone +and sentiment before and after the introduction of ChatGPT in November 2022. +Using various computational and natural language processing techniques to +measure emotional shifts in response to ChatGPT's release, we found an increase +in positive emotion and a more favorable tone post-launch, suggesting initial +optimism toward AI's potential. This research underscores the pivotal role of +journalists as interpreters of technological innovation and disruption, +highlighting how their emotional reactions may shape public narratives around +emerging technologies. The study contributes to understanding the intersection +of journalism, emotion, and AI, offering insights into the broader societal +impact of generative AI tools. + +
+
+
+
+
+ + ☆ Distilling Monolingual and Crosslingual Word-in-Context Representations + + +
+ In this study, we propose a method that distils representations of word +meaning in context from a pre-trained masked language model in both monolingual +and crosslingual settings. Word representations are the basis for context-aware +lexical semantics and unsupervised semantic textual similarity (STS) +estimation. Different from existing approaches, our method does not require +human-annotated corpora nor updates of the parameters of the pre-trained model. +The latter feature is appealing for practical scenarios where the off-the-shelf +pre-trained model is a common asset among different applications. Specifically, +our method learns to combine the outputs of different hidden layers of the +pre-trained model using self-attention. Our auto-encoder based training only +requires an automatically generated corpus. To evaluate the performance of the +proposed approach, we performed extensive experiments using various benchmark +tasks. The results on the monolingual tasks confirmed that our representations +exhibited a competitive performance compared to that of the previous study for +the context-aware lexical semantic tasks and outperformed it for STS +estimation. The results of the crosslingual tasks revealed that the proposed +method largely improved crosslingual word representations of multilingual +pre-trained models. + +
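A minimal sketch of the layer-combination step, learning attention weights over a frozen model's hidden layers for a target word, might look as follows. The layer count, dimensionality, and pooling choices are placeholders, and the auto-encoding training loop from the paper is omitted:

```python
import torch
import torch.nn as nn

# Combine hidden layers of a frozen masked LM with learned self-attention
# to obtain a word-in-context vector. Shapes are illustrative only.

class LayerAttentionPooler(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.score = nn.Linear(hidden_size, 1)

    def forward(self, layer_states: torch.Tensor) -> torch.Tensor:
        # layer_states: (num_layers, hidden_size) for one target word position.
        weights = torch.softmax(self.score(layer_states).squeeze(-1), dim=0)
        return torch.einsum("l,ld->d", weights, layer_states)

pooler = LayerAttentionPooler(hidden_size=768)
fake_layers = torch.randn(13, 768)      # e.g. embedding layer + 12 transformer layers
word_vec = pooler(fake_layers)
print(word_vec.shape)                   # torch.Size([768])
```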
+
+
+
+
+ + ☆ Layerwise Change of Knowledge in Neural Networks + + +
+ This paper aims to explain how a deep neural network (DNN) gradually extracts new knowledge and forgets noisy features through its layers in forward propagation. Although the definition of the knowledge encoded by a DNN has not yet reached a consensus, previous studies have provided mathematical evidence for treating interactions as the symbolic primitive inference patterns encoded by a DNN. We extend the definition of interactions and, for the first time, extract interactions encoded by intermediate layers. We quantify and track the newly emerged interactions and the forgotten interactions in each layer during forward propagation, which sheds new light on the learning behavior of DNNs. The layer-wise change of interactions also reveals the change in the generalization capacity and the instability of a DNN's feature representations. + +
<p></p>
+
+
+
+
+ + ☆ L3Cube-IndicQuest: A Benchmark Question Answering Dataset for Evaluating Knowledge of LLMs in Indic Context + + +
<p></p>
+ Large Language Models (LLMs) have made significant progress in incorporating +Indic languages within multilingual models. However, it is crucial to +quantitatively assess whether these languages perform comparably to globally +dominant ones, such as English. Currently, there is a lack of benchmark +datasets specifically designed to evaluate the regional knowledge of LLMs in +various Indic languages. In this paper, we present the L3Cube-IndicQuest, a +gold-standard question-answering benchmark dataset designed to evaluate how +well multilingual LLMs capture regional knowledge across various Indic +languages. The dataset contains 200 question-answer pairs, each for English and +19 Indic languages, covering five domains specific to the Indic region. We aim +for this dataset to serve as a benchmark, providing ground truth for evaluating +the performance of LLMs in understanding and representing knowledge relevant to +the Indian context. The IndicQuest can be used for both reference-based +evaluation and LLM-as-a-judge evaluation. The dataset is shared publicly at +https://github.com/l3cube-pune/indic-nlp . + +
+
+
+
+
+ + ☆ B4: Towards Optimal Assessment of Plausible Code Solutions with + Plausible Tests + + +
+ Selecting the best code solution from multiple generated ones is an essential +task in code generation, which can be achieved by using some reliable +validators (e.g., developer-written test cases) for assistance. Since reliable +test cases are not always available and can be expensive to build in practice, +researchers propose to automatically generate test cases to assess code +solutions. However, when both code solutions and test cases are plausible and +not reliable, selecting the best solution becomes challenging. Although some +heuristic strategies have been proposed to tackle this problem, they lack a +strong theoretical guarantee and it is still an open question whether an +optimal selection strategy exists. Our work contributes in two ways. First, we +show that within a Bayesian framework, the optimal selection strategy can be +defined based on the posterior probability of the observed passing states +between solutions and tests. The problem of identifying the best solution is +then framed as an integer programming problem. Second, we propose an efficient +approach for approximating this optimal (yet uncomputable) strategy, where the +approximation error is bounded by the correctness of prior knowledge. We then +incorporate effective prior knowledge to tailor code generation tasks. Both +theoretical and empirical studies confirm that existing heuristics are limited +in selecting the best solutions with plausible test cases. Our proposed +approximated optimal strategy B4 significantly surpasses existing heuristics in +selecting code solutions generated by large language models (LLMs) with +LLM-generated tests, achieving a relative performance improvement by up to 50% +over the strongest heuristic and 246% over the random selection in the most +challenging scenarios. Our code is publicly available at +https://github.com/ZJU-CTAG/B4. + +
+
+ comment: accepted by ASE' 24 (full paper) +
+
+
+
+
+ + ☆ NEST-RQ: Next Token Prediction for Speech Self-Supervised Pre-Training + + +
+ Speech self-supervised pre-training can effectively improve the performance +of downstream tasks. However, previous self-supervised learning (SSL) methods +for speech, such as HuBERT and BEST-RQ, focus on utilizing non-causal encoders +with bidirectional context, and lack sufficient support for downstream +streaming models. To address this issue, we introduce the next token prediction +based speech pre-training method with random-projection quantizer (NEST-RQ). +NEST-RQ employs causal encoders with only left context and uses next token +prediction (NTP) as the training task. On the large-scale dataset, compared to +BEST-RQ, the proposed NEST-RQ achieves comparable performance on non-streaming +automatic speech recognition (ASR) and better performance on streaming ASR. We +also conduct analytical experiments in terms of the future context size of +streaming ASR, the codebook quality of SSL and the model size of the encoder. +In summary, the paper demonstrates the feasibility of the NTP in speech SSL and +provides empirical evidence and insights for speech SSL research. + +
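A rough sketch of a BEST-RQ-style random-projection quantizer producing discrete targets, together with the one-step shift used for next token prediction by a causal encoder, is shown below; the feature dimension and codebook size are illustrative:

```python
import torch

# Random-projection quantizer: project frames with a frozen random matrix,
# pick the nearest entry of a frozen random codebook, and use the resulting
# token sequence as next-token-prediction targets.

torch.manual_seed(0)
feat_dim, proj_dim, codebook_size = 80, 16, 1024
projection = torch.randn(feat_dim, proj_dim)          # frozen random projection
codebook = torch.randn(codebook_size, proj_dim)       # frozen random codebook

def quantize(features: torch.Tensor) -> torch.Tensor:
    """features: (T, feat_dim) -> token ids (T,) via nearest codebook entry."""
    projected = features @ projection
    dists = torch.cdist(projected, codebook)           # (T, codebook_size)
    return dists.argmin(dim=-1)

frames = torch.randn(200, feat_dim)                    # stand-in speech features
tokens = quantize(frames)
# Next-token prediction: the causal encoder at frame t predicts the token of frame t+1.
inputs, targets = tokens[:-1], tokens[1:]
print(inputs.shape, targets.shape)
```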
+
+ comment: 5 pages, 2 figures, Work in progress +
+
+
+
+
+ + ☆ Investigating Disentanglement in a Phoneme-level Speech Codec for + Prosody Modeling + + +
+ Most of the prevalent approaches to speech prosody modeling rely on learning global style representations in a continuous latent space, which encode and transfer the attributes of reference speech. However, recent work on neural codecs based on Residual Vector Quantization (RVQ) already shows great potential and offers distinct advantages. We investigate the prosody modeling capabilities of the discrete space of such an RVQ-VAE model, modifying it to operate on the phoneme level. We condition both the encoder and decoder of the model on linguistic representations and apply a global speaker embedding in order to factor out both phonetic and speaker information. We conduct an extensive set of investigations based on subjective experiments and objective measures to show that the phoneme-level discrete latent representations obtained this way achieve a high degree of disentanglement, capturing fine-grained prosodic information that is robust and transferable. The latent space turns out to have an interpretable structure, with its principal components corresponding to pitch and energy. + +
<p></p>
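The residual vector quantization step at the heart of such codecs can be sketched as follows; the codebooks here are random stand-ins rather than trained phoneme-level prosody codebooks:

```python
import torch

# Minimal residual vector quantization (RVQ) pass: each stage quantizes the
# residual left by the previous stage, and the code ids from all stages
# together form the discrete representation.

torch.manual_seed(0)
num_stages, codebook_size, dim = 4, 64, 32
codebooks = [torch.randn(codebook_size, dim) for _ in range(num_stages)]

def rvq_encode(x: torch.Tensor):
    """x: (T, dim) phoneme-level latents -> per-stage code ids and reconstruction."""
    residual, codes, recon = x.clone(), [], torch.zeros_like(x)
    for cb in codebooks:
        idx = torch.cdist(residual, cb).argmin(dim=-1)   # nearest entry per frame
        quantized = cb[idx]
        codes.append(idx)
        recon = recon + quantized
        residual = residual - quantized                  # next stage sees the residual
    return codes, recon

latents = torch.randn(10, dim)
codes, recon = rvq_encode(latents)
print(len(codes), recon.shape)   # 4 stages, reconstruction of shape (10, 32)
```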
+
+
+
+
+ + ☆ LA-RAG: Enhancing LLM-based ASR Accuracy with Retrieval-Augmented Generation ICASSP 2025 + + +
<p></p>
+ Recent advancements in integrating speech information into large language models (LLMs) have significantly improved automatic speech recognition (ASR) accuracy. However, existing methods are often constrained by the capabilities of the speech encoders under varied acoustic conditions, such as accents. To address this, we propose LA-RAG, a novel Retrieval-Augmented Generation (RAG) paradigm for LLM-based ASR. LA-RAG leverages fine-grained token-level speech datastores and a speech-to-speech retrieval mechanism to enhance ASR accuracy via LLM in-context learning (ICL) capabilities. Experiments on Mandarin and various Chinese dialect datasets demonstrate significant improvements in ASR accuracy compared to existing methods, validating the effectiveness of our approach, especially in handling accent variations. + +
<p></p>
+
+ comment: submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Large Language Model Can Transcribe Speech in Multi-Talker Scenarios + with Versatile Instructions + + +
+ Recent advancements in large language models (LLMs) have revolutionized +various domains, bringing significant progress and new opportunities. Despite +progress in speech-related tasks, LLMs have not been sufficiently explored in +multi-talker scenarios. In this work, we present a pioneering effort to +investigate the capability of LLMs in transcribing speech in multi-talker +environments, following versatile instructions related to multi-talker +automatic speech recognition (ASR), target talker ASR, and ASR based on +specific talker attributes such as sex, occurrence order, language, and keyword +spoken. Our approach utilizes WavLM and Whisper encoder to extract +multi-faceted speech representations that are sensitive to speaker +characteristics and semantic context. These representations are then fed into +an LLM fine-tuned using LoRA, enabling the capabilities for speech +comprehension and transcription. Comprehensive experiments reveal the promising +performance of our proposed system, MT-LLM, in cocktail party scenarios, +highlighting the potential of LLM to handle speech-related tasks based on user +instructions in such complex settings. + +
+
+
+
+
+ + ☆ Cracking the Code: Multi-domain LLM Evaluation on Real-World + Professional Exams in Indonesia + + +
+ While knowledge evaluation in large language models has predominantly focused +on academic subjects like math and physics, these assessments often fail to +capture the practical demands of real-world professions. In this paper, we +introduce IndoCareer, a dataset comprising 8,834 multiple-choice questions +designed to evaluate performance in vocational and professional certification +exams across various fields. With a focus on Indonesia, IndoCareer provides +rich local contexts, spanning six key sectors: (1) healthcare, (2) insurance +and finance, (3) creative and design, (4) tourism and hospitality, (5) +education and training, and (6) law. Our comprehensive evaluation of 27 large +language models shows that these models struggle particularly in fields with +strong local contexts, such as insurance and finance. Additionally, while using +the entire dataset, shuffling answer options generally maintains consistent +evaluation results across models, but it introduces instability specifically in +the insurance and finance sectors. + +
+
+
+
+
+ + ☆ Expediting and Elevating Large Language Model Reasoning via Hidden + Chain-of-Thought Decoding + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities in +tasks requiring reasoning and multi-step problem-solving through the use of +chain-of-thought (CoT) prompting. However, generating the full CoT process +results in significantly longer output sequences, leading to increased +computational costs and latency during inference. To address this challenge, we +propose a novel approach to compress the CoT process through semantic +alignment, enabling more efficient decoding while preserving the benefits of +CoT reasoning. Our method introduces an auxiliary CoT model that learns to +generate and compress the full thought process into a compact special token +representation semantically aligned with the original CoT output. This +compressed representation is then integrated into the input of the Hidden +Chain-of-Thought (HCoT) model. The training process follows a two-stage +procedure: First, the CoT model is optimized to generate the compressed token +representations aligned with the ground-truth CoT outputs using a contrastive +loss. Subsequently, with the CoT model parameters frozen, the HCoT model is +fine-tuned to generate accurate subsequent predictions conditioned on the +prefix instruction and the compressed CoT representations from the CoT model. +Extensive experiments across three challenging domains - mathematical +reasoning, agent invocation, and question answering - demonstrate that our +semantic compression approach achieves competitive or improved performance +compared to the full CoT baseline, while providing significant speedups of at +least 1.5x in decoding time. Moreover, incorporating contrastive learning +objectives further enhances the quality of the compressed representations, +leading to better CoT prompting and improved task accuracy. Our work paves the +way for more efficient exploitation of multi-step reasoning capabilities in +LLMs across a wide range of applications. + +
+
+
+
+
+ + ☆ LLM-Powered Grapheme-to-Phoneme Conversion: Benchmark and Case Study + + +
+ Grapheme-to-phoneme (G2P) conversion is critical in speech processing, +particularly for applications like speech synthesis. G2P systems must possess +linguistic understanding and contextual awareness of languages with polyphone +words and context-dependent phonemes. Large language models (LLMs) have +recently demonstrated significant potential in various language tasks, +suggesting that their phonetic knowledge could be leveraged for G2P. In this +paper, we evaluate the performance of LLMs in G2P conversion and introduce +prompting and post-processing methods that enhance LLM outputs without +additional training or labeled data. We also present a benchmarking dataset +designed to assess G2P performance on sentence-level phonetic challenges of the +Persian language. Our results show that by applying the proposed methods, LLMs +can outperform traditional G2P tools, even in an underrepresented language like +Persian, highlighting the potential of developing LLM-aided G2P systems. + +
+
+ comment: 5 pages, 5 figures +
+
+
+
+
+ + ☆ Eir: Thai Medical Large Language Models + + +
+ We present Eir Thai Medical LLM, a large language model with 8 billion +parameters, specifically designed to enhance the accuracy of handling medical +tasks in the Thai language. This model focuses on providing clear and +easy-to-understand answers for both healthcare professionals and patients, +thereby improving the efficiency of diagnosis and treatment processes. Human +evaluation was conducted to ensure that the model adheres to care standards and +provides unbiased answers. + To prioritize data security, the model is deployed within the hospital's +internal network, ensuring both high security and faster processing speeds. The +internal API connection is secured with encryption and strict authentication +measures to prevent data leaks and unauthorized access. + We evaluated several open-source large language models with 8 billion +parameters on four medical benchmarks: MedQA, MedMCQA, PubMedQA, and the +medical subset of MMLU. The best-performing baselines were used to develop Eir +Thai Medical LLM. Our evaluation employed multiple questioning strategies, +including zero-shot, few-shot, chain-of-thought reasoning, and +ensemble/self-consistency voting methods. Our model outperformed commercially +available Thai-language large language models by more than 10%. In addition, we +developed enhanced model testing tailored for clinical use in Thai across 18 +clinical tasks, where our model exceeded GPT-4o performance by more than 11% + +
+
+
+
+
+ + ☆ MAPX: An explainable model-agnostic framework for the detection of false + information on social media networks + + +
+ The automated detection of false information has become a fundamental task in combating the spread of "fake news" on online social media networks (OSMN), as it reduces the need for manual discernment by individuals. In the literature, leveraging various content or context features of OSMN documents has been found useful. However, most existing detection models utilise these features in isolation, without regard to the temporal and dynamic changes often seen in reality, thus limiting the robustness of the models. Furthermore, there has been little to no consideration of the impact of the quality of documents' features on the trustworthiness of the final prediction. In this paper, we introduce a novel model-agnostic framework, called MAPX, which allows evidence-based aggregation of predictions from existing models in an explainable manner. The developed aggregation method is adaptive, dynamic, and considers the quality of OSMN document features. Further, we perform extensive experiments on benchmarked fake news datasets to demonstrate the effectiveness of MAPX under various real-world data quality scenarios. Our empirical results show that the proposed framework consistently outperforms all state-of-the-art models evaluated. For reproducibility, a demo of MAPX is available at https://github.com/SCondran/MAPX_framework + +
<p></p>
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ A BERT-Based Summarization approach for depression detection + + +
+ Depression is a globally prevalent mental disorder with potentially severe +repercussions if not addressed, especially in individuals with recurrent +episodes. Prior research has shown that early intervention has the potential to +mitigate or alleviate symptoms of depression. However, implementing such +interventions in a real-world setting may pose considerable challenges. A +promising strategy involves leveraging machine learning and artificial +intelligence to autonomously detect depression indicators from diverse data +sources. One of the most widely available and informative data sources is text, +which can reveal a person's mood, thoughts, and feelings. In this context, +virtual agents programmed to conduct interviews using clinically validated +questionnaires, such as those found in the DAIC-WOZ dataset, offer a robust +means for depression detection through linguistic analysis. Utilizing +BERT-based models, which are powerful and versatile yet use fewer resources +than contemporary large language models, to convert text into numerical +representations significantly enhances the precision of depression diagnosis. +These models adeptly capture complex semantic and syntactic nuances, improving +the detection accuracy of depressive symptoms. Given the inherent limitations +of these models concerning text length, our study proposes text summarization +as a preprocessing technique to diminish the length and intricacies of input +texts. Implementing this method within our uniquely developed framework for +feature extraction and classification yielded an F1-score of 0.67 on the test +set surpassing all prior benchmarks and 0.81 on the validation set exceeding +most previous results on the DAIC-WOZ dataset. Furthermore, we have devised a +depression lexicon to assess summary quality and relevance. This lexicon +constitutes a valuable asset for ongoing research in depression detection. + +
+
+
+
+
+ + ☆ Explaining Datasets in Words: Statistical Models with Natural Language + Parameters + + +
+ To make sense of massive data, we often fit simplified models and then +interpret the parameters; for example, we cluster the text embeddings and then +interpret the mean parameters of each cluster. However, these parameters are +often high-dimensional and hard to interpret. To make model parameters directly +interpretable, we introduce a family of statistical models -- including +clustering, time series, and classification models -- parameterized by natural +language predicates. For example, a cluster of text about COVID could be +parameterized by the predicate "discusses COVID". To learn these statistical +models effectively, we develop a model-agnostic algorithm that optimizes +continuous relaxations of predicate parameters with gradient descent and +discretizes them by prompting language models (LMs). Finally, we apply our +framework to a wide range of problems: taxonomizing user chat dialogues, +characterizing how they evolve across time, finding categories where one +language model is better than the other, clustering math problems based on +subareas, and explaining visual features in memorable images. Our framework is +highly versatile, applicable to both textual and visual domains, can be easily +steered to focus on specific properties (e.g. subareas), and explains +sophisticated concepts that classical methods (e.g. n-gram analysis) struggle +to produce. + +
+
+
+
+
+ + ☆ When Context Leads but Parametric Memory Follows in Large Language + Models + + +
+ Large language models (LLMs) have demonstrated remarkable progress in +leveraging diverse knowledge sources. This study investigates how nine widely +used LLMs allocate knowledge between local context and global parameters when +answering open-ended questions in knowledge-consistent scenarios. We introduce +a novel dataset, WikiAtomic, and systematically vary context sizes to analyze +how LLMs prioritize and utilize the provided information and their parametric +knowledge in knowledge-consistent scenarios. Additionally, we also study their +tendency to hallucinate under varying context sizes. Our findings reveal +consistent patterns across models, including a consistent reliance on both +contextual (around 70%) and parametric (around 30%) knowledge, and a decrease +in hallucinations with increasing context. These insights highlight the +importance of more effective context organization and developing models that +use input more deterministically for robust performance. + +
+
+
+
+
+ + ♻ ☆ Motion Capture Analysis of Verb and Adjective Types in Austrian Sign + Language + + +
+ Across a number of sign languages, temporal and spatial characteristics of dominant hand articulation are used to express semantic and grammatical features. In this study of Austrian Sign Language (Österreichische Gebärdensprache, or ÖGS), motion capture data of four Deaf signers is used to quantitatively characterize the kinematic parameters of sign production in verbs and adjectives. We investigate (1) the difference in production between verbs involving a natural endpoint (telic verbs; e.g. arrive) and verbs lacking an endpoint (atelic verbs; e.g. analyze), and (2) adjective signs in intensified vs. non-intensified (plain) forms. Motion capture data analysis using linear mixed-effects models (LME) indicates that both the endpoint marking in verbs and the marking of intensification in adjectives are expressed by movement modulation in ÖGS. While the semantic distinction between verb types (telic/atelic) is marked by higher peak velocity and shorter duration for telic signs compared to atelic ones, the grammatical distinction (intensification) in adjectives is expressed by longer duration for intensified compared to non-intensified adjectives. The observed individual differences between signers might be interpreted as personal signing style. + +
<p></p>
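A linear mixed-effects model of the kind described (a kinematic outcome predicted by verb type, with signer as a random effect) could be specified as below; the data frame is a toy stand-in, not the study's motion capture data:

```python
import pandas as pd
import statsmodels.formula.api as smf

# Sketch of an LME analysis: peak velocity as a function of telicity,
# with a random intercept per signer. Values are invented for illustration.

df = pd.DataFrame({
    "peak_velocity": [1.8, 2.1, 1.2, 1.4, 2.0, 1.9, 1.1, 1.3, 2.2, 1.0],
    "telic":         [1, 1, 0, 0, 1, 1, 0, 0, 1, 0],   # 1 = telic verb, 0 = atelic
    "signer":        ["s1", "s2", "s3", "s4", "s1", "s2", "s3", "s4", "s3", "s1"],
})

# Fixed effect: telicity; random intercept: signer.
model = smf.mixedlm("peak_velocity ~ telic", data=df, groups=df["signer"])
result = model.fit()
print(result.summary())
```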
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Pseudo-Prompt Generating in Pre-trained Vision-Language Models for + Multi-Label Medical Image Classification + + +
+ The task of medical image recognition is notably complicated by the presence of varied and multiple pathological indications, presenting a unique challenge in multi-label classification with unseen labels. This complexity underlines the need for computer-aided diagnosis methods employing multi-label zero-shot learning. Recent advancements in pre-trained vision-language models (VLMs) have showcased notable zero-shot classification abilities on medical images. However, these methods have limitations in leveraging extensive pre-trained knowledge from broader image datasets, and often depend on manual prompt construction by expert radiologists. By automating the process of prompt tuning, prompt learning techniques have emerged as an efficient way to adapt VLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in performing class-specific prompts on unseen categories, limiting generalizability in fine-grained scenarios. To overcome these constraints, we introduce a novel prompt generation approach inspired by text generation in natural language processing (NLP). Our method, named Pseudo-Prompt Generating (PsPG), capitalizes on the prior knowledge of multi-modal features. Featuring an RNN-based decoder, PsPG autoregressively generates class-tailored embedding vectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label chest radiograph datasets affirm the superiority of our approach against leading medical vision-language and multi-label prompt learning methods. The source code is available at https://github.com/fallingnight/PsPG + +
<p></p>
+
+ comment: Accepted by PRCV 2024 +
+
+
+
+
+ + ♻ ☆ Natural Language Processing with Commonsense Knowledge: A Survey + + +
+ Commonsense knowledge is essential for advancing natural language processing +(NLP) by enabling models to engage in human-like reasoning, which requires a +deeper understanding of context and often involves making inferences based on +implicit external knowledge. This paper explores the integration of commonsense +knowledge into various NLP tasks. We begin by reviewing prominent commonsense +knowledge bases and then discuss the benchmarks used to evaluate the +commonsense reasoning capabilities of NLP models, particularly language models. +Furthermore, we highlight key methodologies for incorporating commonsense +knowledge and their applications across different NLP tasks. The paper also +examines the challenges and emerging trends in enhancing NLP systems with +commonsense reasoning. All literature referenced in this survey can be accessed +via our GitHub repository: https://github.com/yuboxie/awesome-commonsense. + +
+
+ comment: 20 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Understanding How CodeLLMs (Mis)Predict Types with Activation Steering + + +
+ CodeLLMs are transforming software development as we know it. This is +especially true for tasks where rule-based approaches fall short, like type +prediction. The type prediction task consists in adding a new type annotation +to a partially typed program, such that the resulting program is closer to +being fully typed. The intractability of rule-based approaches and high cost of +manual annotation make CodeLLMs an attractive solution to the problem. However, +CodeLLMs are still far from being deployed on the large-scale due to doubts +surrounding their reliability. + To shed some light on how CodeLLMs approach type prediction, we investigate +what happens when a model mispredicts a type. We show that by applying +semantics-preserving edits to code, CodeLLMs are eventually misled into +mispredicting type annotations. However, by leveraging activation steering we +are able to "steer" the model back to the correct prediction, making models +more robust against semantically irrelevant prompt features. We show that +steering achieves comparable performance to fine-tuning directly on the type +prediction task. Furthermore, we find that steering vectors computed from +Python code are effective at correcting TypeScript mispredictions, and vice +versa. To our knowledge, this is the first evidence of its kind to suggest that +CodeLLMs learn task representations that transfer across languages. + +
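A schematic of the activation-steering mechanism, a steering vector formed from mean activation differences and added to a layer's output at inference, is sketched below; the layer choice, scaling factor, and random activations are illustrative:

```python
import torch

# Build a steering vector from the mean difference of hidden activations
# between examples where the model predicts correctly and where it
# mispredicts, then add it to a layer's output via a forward hook.

def steering_vector(correct_acts: torch.Tensor, wrong_acts: torch.Tensor) -> torch.Tensor:
    # Both tensors: (num_examples, hidden_size) activations at one layer.
    return correct_acts.mean(dim=0) - wrong_acts.mean(dim=0)

def make_hook(vec: torch.Tensor, alpha: float = 1.0):
    def hook(module, inputs, output):
        # Returning a value from a forward hook replaces the layer output.
        return output + alpha * vec
    return hook

hidden = 16
vec = steering_vector(torch.randn(32, hidden) + 0.5, torch.randn(32, hidden))
layer = torch.nn.Linear(hidden, hidden)          # stand-in for a transformer sub-layer
handle = layer.register_forward_hook(make_hook(vec, alpha=2.0))
steered = layer(torch.randn(4, hidden))          # outputs are shifted along vec
handle.remove()
```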
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ An Automatic Quality Metric for Evaluating Simultaneous Interpretation + + +
+ Simultaneous interpretation (SI), the translation of one language to another in real time, starts translating before the original speech has finished. Its evaluation needs to consider both latency and quality. This trade-off is especially challenging for language pairs with distant word orders, such as English and Japanese. To handle this word-order gap, interpreters keep the word order of the source language as much as possible in order to keep up with the original speech and minimize latency while maintaining quality, whereas in translation, reordering is applied to preserve fluency in the target language. This means that outputs synchronized with the source language are desirable in real SI situations, and this is key for further progress in computational SI and simultaneous machine translation (SiMT). In this work, we propose an automatic evaluation metric for SI and SiMT that focuses on word-order synchronization. Our evaluation metric is based on rank correlation coefficients, leveraging cross-lingual pre-trained language models. Our experimental results on NAIST-SIC-Aligned and JNPC show the effectiveness of our metric in measuring word-order synchronization between the source and target languages. + +
<p></p>
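The word-order synchronization idea can be illustrated with a rank correlation over the source positions of aligned target words; the hand-written alignments below stand in for the cross-lingual-LM-based alignment used in the paper:

```python
from scipy.stats import kendalltau

# A synchronization score close to 1 means the output largely follows the
# source word order; negative values indicate heavy reordering.

def synchronization_score(source_positions_of_targets: list[int]) -> float:
    target_order = list(range(len(source_positions_of_targets)))
    tau, _ = kendalltau(target_order, source_positions_of_targets)
    return tau

# Output words aligned (in order) to source positions 0,1,2,4,3:
print(synchronization_score([0, 1, 2, 4, 3]))   # high: mostly monotone
# Heavily reordered output:
print(synchronization_score([4, 3, 2, 1, 0]))   # -1.0: fully reversed
```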
+
+
+
+
+ + ♻ ☆ Combining Data Generation and Active Learning for Low-Resource Question + Answering ICANN 2024 + + +
+ Neural approaches have become very popular in Question Answering (QA), +however, they require a large amount of annotated data. In this work, we +propose a novel approach that combines data augmentation via question-answer +generation with Active Learning to improve performance in low-resource +settings, where the target domains are diverse in terms of difficulty and +similarity to the source domain. We also investigate Active Learning for +question answering in different stages, overall reducing the annotation effort +of humans. For this purpose, we consider target domains in realistic settings, +with an extremely low amount of annotated samples but with many unlabeled +documents, which we assume can be obtained with little effort. Additionally, we +assume a sufficient amount of labeled data from the source domain being +available. We perform extensive experiments to find the best setup for +incorporating domain experts. Our findings show that our novel approach, where +humans are incorporated in a data generation approach, boosts performance in +the low-resource, domain-specific setting, allowing for low-labeling-effort +question answering systems in new, specialized domains. They further +demonstrate how human annotation affects the performance of QA depending on the +stage it is performed. + +
+
+ comment: ICANN 2024 +
+
+
+
+
+ + ♻ ☆ Predicting Sentence-Level Factuality of News and Bias of Media Outlets + + +
+ Automated news credibility and fact-checking at scale require accurately +predicting news factuality and media bias. This paper introduces a large +sentence-level dataset, titled "FactNews", composed of 6,191 sentences expertly +annotated according to factuality and media bias definitions proposed by +AllSides. We use FactNews to assess the overall reliability of news sources, by +formulating two text classification problems for predicting sentence-level +factuality of news reporting and bias of media outlets. Our experiments +demonstrate that biased sentences present a higher number of words compared to +factual sentences, besides having a predominance of emotions. Hence, the +fine-grained analysis of subjectivity and impartiality of news articles +provided promising results for predicting the reliability of media outlets. +Finally, due to the severity of fake news and political polarization in Brazil, +and the lack of research for Portuguese, both dataset and baseline were +proposed for Brazilian Portuguese. + +
+
+ comment: Proceedings of the 14th International Conference on Recent Advances + in Natural Language Processing (RANLP 2023). + https://aclanthology.org/2023.ranlp-1.127 +
+
+
+
+
+ + ♻ ☆ Performance Law of Large Language Models + + +
+ Guided by the belief of the scaling law, large language models (LLMs) have +achieved impressive performance in recent years. However, scaling law only +gives a qualitative estimation of loss, which is influenced by various factors +such as model architectures, data distributions, tokenizers, and computation +precision. Thus, estimating the real performance of LLMs with different +training settings rather than loss may be quite useful in practical +development. In this article, we present an empirical equation named +"Performance Law" to directly predict the MMLU score of an LLM, which is a +widely used metric to indicate the general capability of LLMs in real-world +conversations and applications. Based on only a few key hyperparameters of the +LLM architecture and the size of training data, we obtain a quite accurate MMLU +prediction of various LLMs with diverse sizes and architectures developed by +different organizations in different years. Performance law can be used to +guide the choice of LLM architecture and the effective allocation of +computational resources without extensive experiments. + +
+
+ comment: Personal opinions of the authors +
+
+
+
+
+ + ♻ ☆ The Design of Informative Take-Over Requests for Semi-Autonomous + Cyber-Physical Systems: Combining Spoken Language and Visual Icons in a + Drone-Controller Setting + + +
+ The question of how cyber-physical systems should interact with human +partners that can take over control or exert oversight is becoming more +pressing, as these systems are deployed for an ever larger range of tasks. +Drawing on the literatures on handing over control during semi-autonomous +driving and human-robot interaction, we propose a design of a take-over request +that combines an abstract pre-alert with an informative TOR: Relevant sensor +information is highlighted on the controller's display, while a spoken message +verbalizes the reason for the TOR. We conduct our study in the context of a +semi-autonomous drone control scenario as our testbed. The goal of our online +study is to assess in more detail what form a language-based TOR should take. +Specifically, we compare a full sentence condition to shorter fragments, and +test whether the visual highlighting should be done synchronously or +asynchronously with the speech. Participants showed a higher accuracy in +choosing the correct solution with our bi-modal TOR and felt that they were +better able to recognize the critical situation. Using only fragments in the +spoken message rather than full sentences did not lead to improved accuracy or +faster reactions. Also, synchronizing the visual highlighting with the spoken +message did not result in better accuracy and response times were even +increased in this condition. + +
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model + + +
+ In this study, we introduce CT-LLM, a 2B large language model (LLM) that +illustrates a pivotal shift towards prioritizing the Chinese language in +developing LLMs. Uniquely initiated from scratch, CT-LLM diverges from the +conventional methodology by primarily incorporating Chinese textual data, +utilizing an extensive corpus of 1,200 billion tokens, including 800 billion +Chinese tokens, 300 billion English tokens, and 100 billion code tokens. This +strategic composition facilitates the model's exceptional proficiency in +understanding and processing Chinese, a capability further enhanced through +alignment techniques. Demonstrating remarkable performance on the CHC-Bench, +CT-LLM excels in Chinese language tasks, and showcases its adeptness in English +through SFT. This research challenges the prevailing paradigm of training LLMs +predominantly on English corpora and then adapting them to other languages, +broadening the horizons for LLM training methodologies. By open-sourcing the +full process of training a Chinese LLM, including a detailed data processing +procedure with the obtained Massive Appropriate Pretraining Chinese Corpus +(MAP-CC), a well-chosen multidisciplinary Chinese Hard Case Benchmark +(CHC-Bench), and the 2B-size Chinese Tiny LLM (CT-LLM), we aim to foster +further exploration and innovation in both academia and industry, paving the +way for more inclusive and versatile language models. + +
+
+
+
+
+ + ♻ ☆ RRWKV: Capturing Long-range Dependencies in RWKV + + +
+ Owing to the impressive dot-product attention, the Transformers have been the +dominant architectures in various natural language processing (NLP) tasks. +Recently, the Receptance Weighted Key Value (RWKV) architecture follows a +non-transformer architecture to eliminate the drawbacks of dot-product +attention, where memory and computational complexity exhibits quadratic scaling +with sequence length. Although RWKV has exploited a linearly tensor-product +attention mechanism and achieved parallelized computations by deploying the +time-sequential mode, it fails to capture long-range dependencies because of +its limitation on looking back at previous information, compared with full +information obtained by direct interactions in the standard transformer. +Therefore, the paper devises the Retrospected Receptance Weighted Key Value +(RRWKV) architecture via incorporating the retrospecting ability into the RWKV +to effectively absorb information, which maintains memory and computational +efficiency as well. + +
+
+ comment: Upon further review, the authors have determined that the conclusions + presented in the paper are no longer valid or contain errors. As a result, we + have decided to withdraw the paper to avoid the spread of incorrect findings +
+
+
+
+
+ + ♻ ☆ CodeJudge-Eval: Can Large Language Models be Good Judges in Code + Understanding? + + +
+ Recent advancements in large language models (LLMs) have showcased impressive +code generation capabilities, primarily evaluated through language-to-code +benchmarks. However, these benchmarks may not fully capture a model's code +understanding abilities. We introduce CodeJudge-Eval (CJ-Eval), a novel +benchmark designed to assess LLMs' code understanding abilities from the +perspective of code judging rather than code generation. CJ-Eval challenges +models to determine the correctness of provided code solutions, encompassing +various error types and compilation issues. By leveraging a diverse set of +problems and a fine-grained judging system, CJ-Eval addresses the limitations +of traditional benchmarks, including the potential memorization of solutions. +Evaluation of 12 well-known LLMs on CJ-Eval reveals that even state-of-the-art +models struggle, highlighting the benchmark's ability to probe deeper into +models' code understanding abilities. Our codes and benchmark are available at +\url{https://github.com/CodeLLM-Research/CodeJudge-Eval}. + +
+
+ comment: The first two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Sub-SA: Strengthen In-context Learning via Submodular Selective + Annotation ECAI 2024 + + +
+ In-context learning (ICL) leverages in-context examples as prompts for the predictions of Large Language Models (LLMs). These prompts play a crucial role in achieving strong performance. However, the selection of suitable prompts from a large pool of labeled examples often entails significant annotation costs. To address this challenge, we propose Sub-SA (Submodular Selective Annotation), a submodular-function-based selective annotation method. The aim of Sub-SA is to reduce annotation costs while improving the quality of in-context examples and minimizing the time consumption of the selection process. In Sub-SA, we design a submodular function that facilitates effective subset selection for annotation and exhibits monotonicity and submodularity from a theoretical perspective. Specifically, we propose RPR (Reward and Penalty Regularization) to better balance the diversity and representativeness of the unlabeled dataset through a reward term and a penalty term, respectively. Consequently, the selection of annotations can be effectively addressed with a simple yet effective greedy search algorithm based on the submodular function. Finally, we apply similarity-based prompt retrieval to obtain the examples for ICL. + +
<p></p>
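The greedy loop over a monotone submodular objective can be sketched as follows. The facility-location-style objective with a representativeness reward and a redundancy penalty is only a stand-in; the actual Sub-SA function and its RPR regularization are defined in the paper:

```python
import numpy as np

# Greedy subset selection under a hypothetical monotone submodular gain:
# reward for newly covered similarity mass, penalty for redundancy with
# already selected items.

def gain(selected: list[int], candidate: int, sim: np.ndarray, lam: float = 0.5) -> float:
    cover_new = np.maximum(sim[candidate], np.max(sim[selected], axis=0) if selected else 0.0)
    cover_old = np.max(sim[selected], axis=0) if selected else np.zeros(sim.shape[1])
    reward = cover_new.sum() - cover_old.sum()                          # representativeness gain
    penalty = max((sim[candidate][j] for j in selected), default=0.0)   # redundancy with selection
    return reward - lam * penalty

def greedy_select(sim: np.ndarray, budget: int) -> list[int]:
    selected: list[int] = []
    while len(selected) < budget:
        best = max((i for i in range(len(sim)) if i not in selected),
                   key=lambda i: gain(selected, i, sim))
        selected.append(best)
    return selected

rng = np.random.default_rng(0)
emb = rng.normal(size=(50, 8))
emb /= np.linalg.norm(emb, axis=1, keepdims=True)
similarity = emb @ emb.T                                     # cosine similarities
print(greedy_select(similarity, budget=5))                   # indices chosen for annotation
```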
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ♻ ☆ DOCE: Finding the Sweet Spot for Execution-Based Code Generation + + +
+ Recently, a diverse set of decoding and reranking procedures have been shown +effective for LLM-based code generation. However, a comprehensive framework +that links and experimentally compares these methods is missing. We address +this by proposing Decoding Objectives for Code Execution, a comprehensive +framework that includes candidate generation, $n$-best reranking, minimum Bayes +risk (MBR) decoding, and self-debugging as the core components. We then study +the contributions of these components through execution-based evaluation +metrics. Our findings highlight the importance of execution-based methods and +the difference gap between execution-based and execution-free methods. +Furthermore, we assess the impact of filtering based on trial unit tests, a +simple and effective strategy that has been often overlooked in prior works. We +also propose self-debugging on multiple candidates, obtaining state-of-the-art +performance on reranking for code generation. We expect our framework to +provide a solid guideline for future research on code generation. + +
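One of the core components, execution-based minimum Bayes risk (MBR) decoding, can be sketched as selecting the candidate whose outputs on trial inputs agree most with the other candidates; the `run` function below is a hypothetical stand-in for executing generated code on trial unit-test inputs:

```python
# Execution-based MBR: run every candidate on the same trial inputs and pick
# the candidate with the highest expected agreement with the others.

def run(candidate: str, test_input: int) -> int:
    # Hypothetical stand-in: pretend each "program" is a Python expression in x.
    return eval(candidate, {}, {"x": test_input})

def mbr_select(candidates: list[str], test_inputs: list[int]) -> str:
    outputs = {c: tuple(run(c, t) for t in test_inputs) for c in candidates}
    def expected_agreement(c: str) -> int:
        return sum(a == b for other in candidates if other != c
                   for a, b in zip(outputs[c], outputs[other]))
    return max(candidates, key=expected_agreement)

cands = ["x * 2", "x + x", "x ** 2"]              # two candidates agree, one differs
print(mbr_select(cands, test_inputs=[1, 2, 3]))   # selects one of the agreeing pair
```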
+
+ comment: 10 pages (32 including appendix), 5 figures, 25 tables. Prompts are + provided in the GitHub repository to avoid potential text overlap with other + papers +
+
+
+
+
+ + ♻ ☆ MoDeGPT: Modular Decomposition for Large Language Model Compression + + +
+ Large Language Models (LLMs) have reshaped the landscape of artificial +intelligence by demonstrating exceptional performance across various tasks. +However, substantial computational requirements make their deployment +challenging on devices with limited resources. Recently, compression methods +using low-rank matrix techniques have shown promise, yet these often lead to +degraded accuracy or introduce significant overhead in parameters and inference +latency. This paper introduces \textbf{Mo}dular \textbf{De}composition +(MoDeGPT), a novel structured compression framework that does not need recovery +fine-tuning while resolving the above drawbacks. MoDeGPT partitions the +Transformer block into modules comprised of matrix pairs and reduces the hidden +dimensions via reconstructing the module-level outputs. MoDeGPT is developed +based on a theoretical framework that utilizes three well-established matrix +decomposition algorithms -- Nystr\"om approximation, CR decomposition, and SVD +-- and applies them to our redefined transformer modules. Our comprehensive +experiments show MoDeGPT, without backward propagation, matches or surpasses +previous structured compression methods that rely on gradient information, and +saves 98% of compute costs on compressing a 13B model. On \textsc{Llama}-2/3 +and OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30% +compression rates. Moreover, the compression can be done on a single GPU within +a few hours and increases the inference throughput by up to 46%. + +
+
+ comment: 31 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ RT-Surv: Improving Mortality Prediction After Radiotherapy with Large + Language Model Structuring of Large-Scale Unstructured Electronic Health + Records + + +
+ Accurate patient selection is critical in radiotherapy (RT) to prevent +ineffective treatments. Traditional survival prediction models, relying on +structured data, often lack precision. This study explores the potential of +large language models (LLMs) to structure unstructured electronic health record +(EHR) data, thereby improving survival prediction accuracy through +comprehensive clinical information integration. Data from 34,276 patients +treated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed, +encompassing both structured and unstructured data. An open-source LLM was used +to structure the unstructured EHR data via single-shot learning, with its +performance compared against a domain-specific medical LLM and a smaller +variant. Survival prediction models were developed using statistical, machine +learning, and deep learning approaches, incorporating both structured and +LLM-structured data. Clinical experts evaluated the accuracy of the +LLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring +unstructured EHR data without additional training, significantly outperforming +the domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs +were more effective, particularly in extracting clinically relevant features +like general condition and disease extent, which closely correlated with +patient survival. Incorporating LLM-structured clinical features into survival +prediction models significantly improved accuracy, with the C-index of deep +learning models increasing from 0.737 to 0.820. These models also became more +interpretable by emphasizing clinically significant factors. This study shows +that general-domain LLMs, even without specific medical training, can +effectively structure large-scale unstructured EHR data, substantially +enhancing the accuracy and interpretability of clinical predictive models. + +
+
+ comment: 23 pages, 2 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ TeXBLEU: Automatic Metric for Evaluate LaTeX Format + + +
+ LaTeX is suitable for creating specially formatted documents in science,
+technology, mathematics, and computer science. Although the use of mathematical
+expressions in LaTeX format along with language models is increasing, there are
+no proper evaluation metrics to evaluate them. In this study, we propose
+TeXBLEU, a metric for evaluating mathematical expressions in the LaTeX format
+built on the n-gram-based BLEU metric widely used in translation tasks. The
+proposed TeXBLEU consists of a predefined tokenizer trained on the arXiv paper
+dataset and a fine-tuned embedding model with positional encoding. The TeXBLEU
+score is calculated by replacing BLEU's modified precision score with the
+similarity of n-gram-based tokens. TeXBLEU showed improvements of 86%, 121%,
+and 610% over traditional evaluation metrics, such as BLEU, sacreBLEU, and
+Rouge, respectively, on the MathBridge dataset with 1,000 data points. The code
+is available at https://github.com/KyuDan1/TeXBLEU.
+
+
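+ A simplified sketch of the core idea, replacing exact n-gram matches with
+embedding similarity, is given below; the tokenizer, the `embed` function, and
+the pooling are placeholders rather than TeXBLEU's trained components.
+
+import numpy as np
+
+def soft_ngram_precision(hyp_toks, ref_toks, embed, n=2):
+    """Credit each hypothesis n-gram with its best cosine similarity to any
+    reference n-gram, instead of requiring an exact match (sketch)."""
+    def grams(toks):
+        vecs = [np.mean([embed(t) for t in toks[i:i + n]], axis=0)
+                for i in range(len(toks) - n + 1)]
+        return [v / (np.linalg.norm(v) + 1e-9) for v in vecs]
+    H, R = grams(hyp_toks), grams(ref_toks)
+    if not H or not R:
+        return 0.0
+    return float(np.mean([max(float(h @ r) for r in R) for h in H]))
+
+# usage with a toy embedding table over LaTeX-ish tokens
+table = {t: np.random.rand(8) for t in ["\\frac", "{", "}", "x", "y", "+"]}
+score = soft_ngram_precision(["\\frac", "{", "x", "}"],
+                             ["\\frac", "{", "y", "}"], table.get)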
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Truth-Aware Context Selection: Mitigating Hallucinations of Large + Language Models Being Misled by Untruthful Contexts ACL 2024 + + +
+ Although Large Language Models (LLMs) have demonstrated impressive text
+generation capabilities, they are easily misled by untruthful contexts provided
+by users or knowledge augmentation tools, leading to hallucinations. To keep
+LLMs from being misled by untruthful context while still taking advantage of
+knowledge augmentation, we propose Truth-Aware Context Selection (TACS), a
+lightweight method to adaptively recognize and mask untruthful context from the
+inputs. TACS begins by performing truth detection on the input context,
+leveraging the parameterized knowledge within the LLM. Subsequently, it
+constructs a corresponding attention mask based on the truthfulness of each
+position, selecting the truthful context and discarding the untruthful context.
+Additionally, we introduce a new evaluation metric, Disturbance Adaption Rate,
+to further study the LLMs' ability to accept truthful information and resist
+untruthful information. Experimental results indicate that TACS can effectively
+filter untruthful context and significantly improve the overall quality of
+LLMs' responses when presented with misleading information.
+
+
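+ The masking step can be sketched in a few lines; the per-token truthfulness
+scores and the 0.5 threshold below stand in for the paper's truth detector and
+are assumptions for illustration.
+
+import torch
+
+def truth_aware_mask(scores: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
+    """Turn per-token truthfulness scores in [0, 1] into an additive attention
+    mask: positions judged untruthful get -inf and are ignored (sketch)."""
+    keep = scores >= threshold
+    return torch.where(keep,
+                       torch.zeros_like(scores),
+                       torch.full_like(scores, float("-inf")))
+
+# usage: mask two low-scoring context tokens out of six
+mask = truth_aware_mask(torch.tensor([0.9, 0.8, 0.2, 0.1, 0.7, 0.95]))
+# add `mask` to the attention logits over the context positions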
+
+ comment: Accepted to ACL 2024 Findings. Code is available at: + https://github.com/ictnlp/TACS +
+
+
+
+
+ + ♻ ☆ Patch-Level Training for Large Language Models + + +
+ As Large Language Models (LLMs) achieve remarkable progress in language +understanding and generation, their training efficiency has become a critical +concern. Traditionally, LLMs are trained to predict the next token in a +sequence. Despite the success of token-level training, it suffers from +considerable computational costs due to the need to process an extensive number +of tokens. To mitigate this issue, this paper introduces patch-level training +for LLMs, which reduces the sequence length by compressing multiple tokens into +a single patch. During patch-level training, we feed the language model shorter +sequences of patches and train it to predict the next patch, thereby processing +the majority of the training data at a significantly reduced computational +cost. Following this, the model continues token-level training on the remaining +training data to align with the inference mode. Experiments on a diverse range +of models (370M-2.7B parameters) demonstrate that patch-level training can +reduce overall computational costs to 0.5$\times$, without compromising the +model performance compared to token-level training. Source code: +\url{https://github.com/shaochenze/PatchTrain}. + +
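+ The token-to-patch compression is easy to picture; the sketch below forms
+patches by mean-pooling groups of consecutive token embeddings, with the patch
+size of 4 and the pooling choice taken as illustrative assumptions.
+
+import torch
+
+def tokens_to_patches(token_embeds: torch.Tensor, patch_size: int = 4) -> torch.Tensor:
+    """Compress a (batch, seq, dim) tensor of token embeddings into
+    (batch, seq // patch_size, dim) patches by mean-pooling each group of
+    patch_size consecutive tokens."""
+    b, s, d = token_embeds.shape
+    s = (s // patch_size) * patch_size            # drop any ragged tail
+    patches = token_embeds[:, :s].reshape(b, s // patch_size, patch_size, d)
+    return patches.mean(dim=2)
+
+# a 2048-token batch becomes a 512-patch batch, cutting sequence length 4x
+x = torch.randn(8, 2048, 768)
+print(tokens_to_patches(x).shape)                 # torch.Size([8, 512, 768])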
+
+
+
+
+ + ♻ ☆ IndoCulture: Exploring Geographically-Influenced Cultural Commonsense + Reasoning Across Eleven Indonesian Provinces ACL + + +
+ Although commonsense reasoning is greatly shaped by cultural and geographical
+factors, previous studies have predominantly centered on cultures grounded in
+the English language, potentially resulting in an Anglocentric bias. In this
+paper, we introduce IndoCulture, aimed at understanding the influence of
+geographical factors on language model reasoning ability, with a specific
+emphasis on the diverse cultures found within eleven Indonesian provinces. In
+contrast to prior work that has relied on templates (Yin et al., 2022) and
+online scraping (Fung et al., 2024), we create IndoCulture by asking local
+people to manually develop a cultural context and plausible options, across a
+set of predefined topics. Evaluation of 27 language models reveals several
+insights: (1) the open-weight Llama-3 is competitive with GPT-4, while other
+open-weight models struggle, with accuracies below 50%; (2) models generally
+perform better for some provinces, such as Bali and West Java, and less well
+for others; and (3) the inclusion of location context enhances performance,
+especially for larger models like GPT-4, emphasizing the significance of
+geographical context in commonsense reasoning.
+
+
+
+ comment: Accepted at TACL +
+
+
+
+
+ + ♻ ☆ Enhancing Temporal Understanding in Audio Question Answering for Large + Audio Language Models + + +
+ The Audio Question Answering task includes audio event classification, audio
+captioning, and open-ended reasoning. Recently, Audio Question Answering has
+garnered attention due to the advent of Large Audio Language Models (LALMs).
+Current literature focuses on constructing LALMs by integrating audio encoders
+with text-only Large Language Models through a projection module. While Large
+Audio Language Models excel in general audio understanding, they are limited in
+temporal reasoning, which may hinder their commercial applications and
+on-device deployment. This paper addresses these challenges and limitations in
+audio temporal reasoning. First, we introduce a data augmentation technique for
+generating reliable audio temporal questions and answers using an LLM. Second,
+we propose a continued finetuning curriculum learning strategy to specialize in
+temporal reasoning without compromising performance on finetuned tasks.
+Finally, we develop a reliable and transparent automated metric, assisted by an
+LLM, to measure the correlation between Large Audio Language Model responses
+and ground truth data intelligently. We demonstrate the effectiveness of our
+proposed techniques using SOTA LALMs on public audio benchmark datasets.
+
+
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Crossmodal ASR Error Correction with Discrete Speech Units + + +
+ ASR remains unsatisfactory in scenarios where the speaking style diverges +from that used to train ASR systems, resulting in erroneous transcripts. To +address this, ASR Error Correction (AEC), a post-ASR processing approach, is +required. In this work, we tackle an understudied issue: the Low-Resource +Out-of-Domain (LROOD) problem, by investigating crossmodal AEC on very limited +downstream data with 1-best hypothesis transcription. We explore pre-training +and fine-tuning strategies and uncover an ASR domain discrepancy phenomenon, +shedding light on appropriate training schemes for LROOD data. Moreover, we +propose the incorporation of discrete speech units to align with and enhance +the word embeddings for improving AEC quality. Results from multiple corpora +and several evaluation metrics demonstrate the feasibility and efficacy of our +proposed AEC approach on LROOD data as well as its generalizability and +superiority on large-scale data. Finally, a study on speech emotion recognition +confirms that our model produces ASR error-robust transcripts suitable for +downstream applications. + +
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ♻ ☆ Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study + on Word Error Rate and Fusion Techniques + + +
+ Text data is commonly utilized as a primary input to enhance Speech Emotion +Recognition (SER) performance and reliability. However, the reliance on +human-transcribed text in most studies impedes the development of practical SER +systems, creating a gap between in-lab research and real-world scenarios where +Automatic Speech Recognition (ASR) serves as the text source. Hence, this study +benchmarks SER performance using ASR transcripts with varying Word Error Rates +(WERs) from eleven models on three well-known corpora: IEMOCAP, CMU-MOSI, and +MSP-Podcast. Our evaluation includes both text-only and bimodal SER with six +fusion techniques, aiming for a comprehensive analysis that uncovers novel +findings and challenges faced by current SER research. Additionally, we propose +a unified ASR error-robust framework integrating ASR error correction and +modality-gated fusion, achieving lower WER and higher SER results compared to +the best-performing ASR transcript. These findings provide insights into SER +with ASR assistance, especially for real-world applications. + +
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ♻ ☆ Explaining Large Language Models Decisions with Shapley Values + + +
+ The emergence of large language models (LLMs) has opened up exciting +possibilities for simulating human behavior and cognitive processes, with +potential applications in various domains, including marketing research and +consumer behavior analysis. However, the validity of utilizing LLMs as +stand-ins for human subjects remains uncertain due to glaring divergences that +suggest fundamentally different underlying processes at play and the +sensitivity of LLM responses to prompt variations. This paper presents a novel +approach based on Shapley values from cooperative game theory to interpret LLM +behavior and quantify the relative contribution of each prompt component to the +model's output. Through two applications - a discrete choice experiment and an +investigation of cognitive biases - we demonstrate how the Shapley value method +can uncover what we term "token noise" effects, a phenomenon where LLM +decisions are disproportionately influenced by tokens providing minimal +informative content. This phenomenon raises concerns about the robustness and +generalizability of insights obtained from LLMs in the context of human +behavior simulation. Our model-agnostic approach extends its utility to +proprietary LLMs, providing a valuable tool for practitioners and researchers +to strategically optimize prompts and mitigate apparent cognitive biases. Our +findings underscore the need for a more nuanced understanding of the factors +driving LLM responses before relying on them as substitutes for human subjects +in survey settings. We emphasize the importance of researchers reporting +results conditioned on specific prompt templates and exercising caution when +drawing parallels between human behavior and LLMs. + +
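+ The prompt-attribution idea follows the standard permutation-sampling
+estimator for Shapley values. In the sketch below, `value_fn` (for example, the
+probability the model assigns to a given answer when only the included prompt
+components are present) is an assumed placeholder.
+
+import random
+
+def shapley_prompt_components(components, value_fn, n_samples=200, seed=0):
+    """Monte-Carlo Shapley values: average each component's marginal
+    contribution to value_fn over random orderings of the components."""
+    rng = random.Random(seed)
+    phi = {c: 0.0 for c in components}
+    for _ in range(n_samples):
+        order = components[:]
+        rng.shuffle(order)
+        included, prev = [], value_fn(frozenset())
+        for c in order:
+            included.append(c)
+            cur = value_fn(frozenset(included))
+            phi[c] += (cur - prev) / n_samples
+            prev = cur
+    return phi
+
+# toy usage: with an additive value function the estimates recover the weights
+parts = ["persona", "task_instruction", "filler_tokens"]
+weights = {"persona": 0.10, "task_instruction": 0.60, "filler_tokens": 0.05}
+print(shapley_prompt_components(parts, lambda s: sum(weights[c] for c in s)))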
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 98 + +
+
+
+ + ☆ An Efficient and Streaming Audio Visual Active Speaker Detection System + + +
+ This paper delves into the challenging task of Active Speaker Detection +(ASD), where the system needs to determine in real-time whether a person is +speaking or not in a series of video frames. While previous works have made +significant strides in improving network architectures and learning effective +representations for ASD, a critical gap exists in the exploration of real-time +system deployment. Existing models often suffer from high latency and memory +usage, rendering them impractical for immediate applications. To bridge this +gap, we present two scenarios that address the key challenges posed by +real-time constraints. First, we introduce a method to limit the number of +future context frames utilized by the ASD model. By doing so, we alleviate the +need for processing the entire sequence of future frames before a decision is +made, significantly reducing latency. Second, we propose a more stringent +constraint that limits the total number of past frames the model can access +during inference. This tackles the persistent memory issues associated with +running streaming ASD systems. Beyond these theoretical frameworks, we conduct +extensive experiments to validate our approach. Our results demonstrate that +constrained transformer models can achieve performance comparable to or even +better than state-of-the-art recurrent models, such as uni-directional GRUs, +with a significantly reduced number of context frames. Moreover, we shed light +on the temporal memory requirements of ASD systems, revealing that larger past +context has a more profound impact on accuracy than future context. When +profiling on a CPU we find that our efficient architecture is memory bound by +the amount of past context it can use and that the compute cost is negligible +as compared to the memory cost. + +
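+ The two constraints (bounded look-ahead and bounded past) amount to running
+the model over a rolling window; the sketch below is one plausible buffering
+scheme, with the window sizes and the frame features left as placeholders.
+
+from collections import deque
+
+class StreamingContext:
+    """Rolling buffer that caps how many past frames a streaming detector may
+    attend to and how many future frames it waits for (sketch)."""
+    def __init__(self, max_past: int = 64, future: int = 2):
+        self.past = deque(maxlen=max_past)
+        self.future = future
+        self.pending = deque()
+
+    def push(self, frame_feat):
+        """Add one frame; once `future` look-ahead frames are available,
+        return the context window for the oldest undecided frame."""
+        self.pending.append(frame_feat)
+        if len(self.pending) <= self.future:
+            return None                      # still waiting for look-ahead
+        current = self.pending.popleft()
+        window = list(self.past) + [current] + list(self.pending)
+        self.past.append(current)
+        return window                        # feed this window to the model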
+
+
+
+
+ + ☆ Pushing the boundaries of event subsampling in event-based video + classification using CNNs + + +
+ Event cameras offer low-power visual sensing capabilities ideal for
+edge-device applications. However, their high event rate, driven by high
+temporal details, can be restrictive in terms of bandwidth and computational
+resources. In edge AI applications, determining the minimum number of events
+for specific tasks can allow reducing the event rate to improve bandwidth,
+memory, and processing efficiency. In this paper, we study the effect of event
+subsampling on the accuracy of event data classification using convolutional
+neural network (CNN) models. Surprisingly, across various datasets, the number
+of events per video can be reduced by an order of magnitude with little drop in
+accuracy, revealing the extent to which we can push the boundaries in the
+accuracy vs. event rate trade-off. Additionally, we find that lower
+classification accuracy at high subsampling rates is not solely attributable to
+information loss due to the subsampling of the events, but that the training of
+CNNs can be challenging in highly subsampled scenarios, where the sensitivity
+to hyperparameters increases. We quantify training instability across multiple
+event-based classification datasets using a novel metric for evaluating the
+hyperparameter sensitivity of CNNs in different subsampling settings. Finally,
+we analyze the weight gradients of the network to gain insight into this
+instability.
+
+
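+ The subsampling step itself is worth spelling out; the (x, y, t, polarity)
+event layout and the kept fraction below are assumptions for illustration.
+
+import numpy as np
+
+def subsample_events(events: np.ndarray, keep: int, seed: int = 0) -> np.ndarray:
+    """Randomly keep `keep` events from an (N, 4) array of (x, y, t, polarity)
+    rows, preserving temporal order, before building CNN input frames."""
+    if len(events) <= keep:
+        return events
+    rng = np.random.default_rng(seed)
+    idx = np.sort(rng.choice(len(events), size=keep, replace=False))
+    return events[idx]
+
+# keep one event in ten for a lighter-weight classifier input
+ev = np.random.rand(100_000, 4)
+ev_small = subsample_events(ev, keep=10_000)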
+
+
+
+
+ + ☆ A Diffusion Approach to Radiance Field Relighting using + Multi-Illumination Synthesis + + +
+ Relighting radiance fields is severely underconstrained for multi-view data,
+which is most often captured under a single illumination condition; it is
+especially hard for full scenes containing multiple objects. We introduce a
+method to create relightable radiance fields using such single-illumination
+data by exploiting priors extracted from 2D image diffusion models. We first
+fine-tune a 2D diffusion model on a multi-illumination dataset conditioned by
+light direction, allowing us to augment a single-illumination capture into a
+realistic -- but possibly inconsistent -- multi-illumination dataset from
+directly defined light directions. We use this augmented data to create a
+relightable radiance field represented by 3D Gaussian splats. To allow direct
+control of light direction for low-frequency lighting, we represent appearance
+with a multi-layer perceptron parameterized on light direction. To enforce
+multi-view consistency and overcome inaccuracies, we optimize a per-image
+auxiliary feature vector. We show results on synthetic and real multi-view data
+under single illumination, demonstrating that our method successfully exploits
+2D diffusion model priors to allow realistic 3D relighting for complete scenes.
+Project site
+https://repo-sam.inria.fr/fungraph/generative-radiance-field-relighting/
+
+
+
+ comment: Project site + https://repo-sam.inria.fr/fungraph/generative-radiance-field-relighting/ +
+
+
+
+
+ + ☆ Pushing Joint Image Denoising and Classification to the Edge ECCV 2024 + + +
+ In this paper, we jointly combine image classification and image denoising,
+aiming to enhance human perception of noisy images captured by edge devices,
+like low-light security cameras. In such settings, it is important to retain
+the ability of humans to verify the automatic classification decision and thus
+jointly denoise the image to enhance human perception. Since edge devices have
+little computational power, we explicitly optimize for efficiency by proposing
+a novel architecture that integrates the two tasks. Additionally, we adapt a
+Neural Architecture Search (NAS) method, originally designed to search for
+classifiers, to instead search for the integrated model while optimizing for a
+target latency, classification accuracy, and denoising performance. The NAS
+architectures outperform our manually designed alternatives in both denoising
+and classification, offering a significant improvement to human perception. Our
+approach empowers users to construct architectures tailored to domains like
+medical imaging, surveillance systems, and industrial inspections.
+
+
+
+ comment: Accepted paper at the ECCV 2024 workshop on Advances in Image + Manipulation (AIM) +
+
+
+
+
+ + ☆ ClearDepth: Enhanced Stereo Perception of Transparent Objects for + Robotic Manipulation + + +
+ Transparent object depth perception poses a challenge in everyday life and
+logistics, primarily due to the inability of standard 3D sensors to accurately
+capture depth on transparent or reflective surfaces. This limitation
+significantly affects depth map and point cloud-reliant applications,
+especially in robotic manipulation. We developed a vision transformer-based
+algorithm for stereo depth recovery of transparent objects. This approach is
+complemented by an innovative feature post-fusion module, which enhances the
+accuracy of depth recovery by exploiting structural features in images. To
+address the high costs associated with dataset collection for stereo
+camera-based perception of transparent objects, our method incorporates a
+parameter-aligned, domain-adaptive, and physically realistic Sim2Real
+simulation for efficient data generation, accelerated by an AI algorithm. Our
+experimental results demonstrate the model's exceptional Sim2Real
+generalizability in real-world scenarios, enabling precise depth mapping of
+transparent objects to assist in robotic manipulation. Project details are
+available at https://sites.google.com/view/cleardepth/.
+
+
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Gaussian is All You Need: A Unified Framework for Solving Inverse + Problems via Diffusion Posterior Sampling + + +
+ Diffusion models can generate a variety of high-quality images by modeling +complex data distributions. Trained diffusion models can also be very effective +image priors for solving inverse problems. Most of the existing diffusion-based +methods integrate data consistency steps within the diffusion reverse sampling +process. The data consistency steps rely on an approximate likelihood function. +In this paper, we show that the existing approximations are either insufficient +or computationally inefficient. To address these issues, we propose a unified +likelihood approximation method that incorporates a covariance correction term +to enhance the performance and avoids propagating gradients through the +diffusion model. The correction term, when integrated into the reverse +diffusion sampling process, achieves better convergence towards the true data +posterior for selected distributions and improves performance on real-world +natural image datasets. Furthermore, we present an efficient way to factorize +and invert the covariance matrix of the likelihood function for several inverse +problems. We present comprehensive experiments to demonstrate the effectiveness +of our method over several existing approaches. + +
+
+
+
+
+ + ☆ D2-MLP: Dynamic Decomposed MLP Mixer for Medical Image Segmentation + + +
+ Convolutional neural networks are widely used in various segmentation tasks
+in medical images. However, they struggle to learn global features adaptively
+due to the inherent locality of convolutional operations. In contrast, MLP
+Mixers are proposed as a backbone to learn global information across channels
+with low complexity. However, they cannot capture spatial features efficiently.
+Additionally, they lack effective mechanisms to fuse and mix features
+adaptively. To tackle these limitations, we propose a novel Dynamic Decomposed
+Mixer module. It is designed to employ novel Mixers to extract features and
+aggregate information across different spatial locations and channels.
+Additionally, it employs novel dynamic mixing mechanisms to model
+inter-dependencies between channel and spatial feature representations and to
+fuse them adaptively. Subsequently, we incorporate it into a U-shaped
+Transformer-based architecture to generate a novel network, termed the Dynamic
+Decomposed MLP Mixer. We evaluated it for medical image segmentation on two
+datasets, and it achieved superior segmentation performance compared to other
+state-of-the-art methods.
+
+
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Visual Language Tracking with Multi-modal Interaction: A Robust + Benchmark + + +
+ Visual Language Tracking (VLT) enhances tracking by mitigating the +limitations of relying solely on the visual modality, utilizing high-level +semantic information through language. This integration of the language enables +more advanced human-machine interaction. The essence of interaction is +cognitive alignment, which typically requires multiple information exchanges, +especially in the sequential decision-making process of VLT. However, current +VLT benchmarks do not account for multi-round interactions during tracking. +They provide only an initial text and bounding box (bbox) in the first frame, +with no further interaction as tracking progresses, deviating from the original +motivation of the VLT task. To address these limitations, we propose a novel +and robust benchmark, VLT-MI (Visual Language Tracking with Multi-modal +Interaction), which introduces multi-round interaction into the VLT task for +the first time. (1) We generate diverse, multi-granularity texts for +multi-round, multi-modal interaction based on existing mainstream VLT +benchmarks using DTLLM-VLT, leveraging the world knowledge of LLMs. (2) We +propose a new VLT interaction paradigm that achieves multi-round interaction +through text updates and object recovery. When multiple tracking failures +occur, we provide the tracker with more aligned texts and corrected bboxes +through interaction, thereby expanding the scope of VLT downstream tasks. (3) +We conduct comparative experiments on both traditional VLT benchmarks and +VLT-MI, evaluating and analyzing the accuracy and robustness of trackers under +the interactive paradigm. This work offers new insights and paradigms for the +VLT task, enabling a fine-grained evaluation of multi-modal trackers. We +believe this approach can be extended to additional datasets in the future, +supporting broader evaluations and comparisons of video-language model +capabilities. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Interactive Masked Image Modeling for Multimodal Object Detection in + Remote Sensing + + +
+ Object detection in remote sensing imagery plays a vital role in various
+Earth observation applications. However, unlike object detection in natural
+scene images, this task is particularly challenging due to the abundance of
+small, often barely visible objects across diverse terrains. To address these
+challenges, multimodal learning can be used to integrate features from
+different data modalities, thereby improving detection accuracy. Nonetheless,
+the performance of multimodal learning is often constrained by the limited size
+of labeled datasets. In this paper, we propose to use Masked Image Modeling
+(MIM) as a pre-training technique, leveraging self-supervised learning on
+unlabeled data to enhance detection performance. However, conventional MIM
+methods such as MAE, which use masked tokens without any contextual
+information, struggle to capture fine-grained details due to a lack of
+interactions with other parts of the image. To address this, we propose a new
+interactive MIM method that can establish interactions between different
+tokens, which is particularly beneficial for object detection in remote
+sensing. The extensive ablation studies and evaluation demonstrate the
+effectiveness of our approach.
+
+
+
+
+
+
+ + ☆ Detect Fake with Fake: Leveraging Synthetic Data-driven Representation + for Synthetic Image Detection ECCV 2024 + + +
+ Are general-purpose visual representations acquired solely from synthetic +data useful for detecting fake images? In this work, we show the effectiveness +of synthetic data-driven representations for synthetic image detection. Upon +analysis, we find that vision transformers trained by the latest visual +representation learners with synthetic data can effectively distinguish fake +from real images without seeing any real images during pre-training. Notably, +using SynCLR as the backbone in a state-of-the-art detection method +demonstrates a performance improvement of +10.32 mAP and +4.73% accuracy over +the widely used CLIP, when tested on previously unseen GAN models. Code is +available at https://github.com/cvpaperchallenge/detect-fake-with-fake. + +
+
+ comment: Accepted to TWYN workshop at ECCV 2024 +
+
+
+
+
+ + ☆ InstantDrag: Improving Interactivity in Drag-based Image Editing SIGGRAPH + + +
+ Drag-based image editing has recently gained popularity for its interactivity +and precision. However, despite the ability of text-to-image models to generate +samples within a second, drag editing still lags behind due to the challenge of +accurately reflecting user interaction while maintaining image content. Some +existing approaches rely on computationally intensive per-image optimization or +intricate guidance-based methods, requiring additional inputs such as masks for +movable regions and text prompts, thereby compromising the interactivity of the +editing process. We introduce InstantDrag, an optimization-free pipeline that +enhances interactivity and speed, requiring only an image and a drag +instruction as input. InstantDrag consists of two carefully designed networks: +a drag-conditioned optical flow generator (FlowGen) and an optical +flow-conditioned diffusion model (FlowDiffusion). InstantDrag learns motion +dynamics for drag-based image editing in real-world video datasets by +decomposing the task into motion generation and motion-conditioned image +generation. We demonstrate InstantDrag's capability to perform fast, +photo-realistic edits without masks or text prompts through experiments on +facial video datasets and general scenes. These results highlight the +efficiency of our approach in handling drag-based image editing, making it a +promising solution for interactive, real-time applications. + +
+
+ comment: SIGGRAPH Asia 2024. Project webpage at + https://joonghyuk.com/instantdrag-web/ +
+
+
+
+
+ + ☆ DX2CT: Diffusion Model for 3D CT Reconstruction from Bi or Mono-planar + 2D X-ray(s) + + +
+ Computed tomography (CT) provides high-resolution medical imaging, but it can
+expose patients to high radiation. X-ray scanners have low radiation exposure,
+but their resolutions are low. This paper proposes a new conditional diffusion
+model, DX2CT, that reconstructs three-dimensional (3D) CT volumes from bi or
+mono-planar X-ray image(s). The proposed DX2CT consists of two key components:
+1) modulating feature maps extracted from two-dimensional (2D) X-ray(s) with 3D
+positions of the CT volume using a new transformer and 2) effectively using the
+modulated 3D position-aware feature maps as conditions of DX2CT. In particular,
+the proposed transformer can provide conditions with rich information of a
+target CT slice to the conditional diffusion model, enabling high-quality CT
+reconstruction. Our experiments with the bi or mono-planar X-ray(s) benchmark
+datasets show that the proposed DX2CT outperforms several state-of-the-art
+methods. Our codes and model will be available at:
+https://www.github.com/intyeger/DX2CT.
+
+
+
+
+
+
+ + ☆ Kinect Calibration and Data Optimization For Anthropometric Parameters + + +
+ In recent years, several 3D vision systems have been developed and widely
+used in various applications, including medical and biometric fields. The
+Microsoft Kinect sensor has been the most widely used camera among these 3D
+vision systems. It can capture depth images of a scene and the 3D coordinates
+of human joints, so anthropometric features can be extracted easily. However,
+the raw anthropometric features and 3D joint coordinates captured by the Kinect
+sensor are unstable, chiefly because the data vary with the distance between an
+individual's joints and the location of the sensor. Consequently, using these
+data without Kinect calibration and data optimization does not yield
+sufficient, reliable results. In this study, we propose a novel method for
+calibrating the Kinect sensor and optimizing skeleton features. Results
+indicate that the proposed method is quite effective and worthy of further
+study in more general scenarios.
+
+
+
+
+
+
+ + ☆ Direct-CP: Directed Collaborative Perception for Connected and + Autonomous Vehicles via Proactive Attention + + +
+ Collaborative perception (CP) leverages visual data from connected and +autonomous vehicles (CAV) to enhance an ego vehicle's field of view (FoV). +Despite recent progress, current CP methods expand the ego vehicle's 360-degree +perceptual range almost equally, which faces two key challenges. Firstly, in +areas with uneven traffic distribution, focusing on directions with little +traffic offers limited benefits. Secondly, under limited communication budgets, +allocating excessive bandwidth to less critical directions lowers the +perception accuracy in more vital areas. To address these issues, we propose +Direct-CP, a proactive and direction-aware CP system aiming at improving CP in +specific directions. Our key idea is to enable an ego vehicle to proactively +signal its interested directions and readjust its attention to enhance local +directional CP performance. To achieve this, we first propose an RSU-aided +direction masking mechanism that assists an ego vehicle in identifying vital +directions. Additionally, we design a direction-aware selective attention +module to wisely aggregate pertinent features based on ego vehicle's +directional priorities, communication budget, and the positional data of CAVs. +Moreover, we introduce a direction-weighted detection loss (DWLoss) to capture +the divergence between directional CP outcomes and the ground truth, +facilitating effective model training. Extensive experiments on the V2X-Sim 2.0 +dataset demonstrate that our approach achieves 19.8\% higher local perception +accuracy in interested directions and 2.5\% higher overall perception accuracy +than the state-of-the-art methods in collaborative 3D object detection tasks. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Breaking reCAPTCHAv2 + + +
+ Our work examines the efficacy of employing advanced machine learning methods +to solve captchas from Google's reCAPTCHAv2 system. We evaluate the +effectiveness of automated systems in solving captchas by utilizing advanced +YOLO models for image segmentation and classification. Our main result is that +we can solve 100% of the captchas, while previous work only solved 68-71%. +Furthermore, our findings suggest that there is no significant difference in +the number of challenges humans and bots must solve to pass the captchas in +reCAPTCHAv2. This implies that current AI technologies can exploit advanced +image-based captchas. We also look under the hood of reCAPTCHAv2, and find +evidence that reCAPTCHAv2 is heavily based on cookie and browser history data +when evaluating whether a user is human or not. The code is provided alongside +this paper. + +
+
+ comment: 10 pages. Accepted at COMPSAC 2024 +
+
+
+
+
+ + ☆ Pathfinder for Low-altitude Aircraft with Binary Neural Network + + +
+ A prior global topological map (e.g., the OpenStreetMap, OSM) can boost the
+performance of autonomous mapping by a ground mobile robot. However, the prior
+map is usually incomplete due to missing labels on some paths. To solve this
+problem, this paper proposes an OSM maker using airborne sensors carried by
+low-altitude aircraft, where the core of the OSM maker is a novel efficient
+pathfinder approach based on LiDAR and camera data, i.e., a binary dual-stream
+road segmentation model. Specifically, a multi-scale feature extraction based
+on the UNet architecture is implemented for images and point clouds. To reduce
+the effect caused by the sparsity of the point cloud, an attention-guided gated
+block is designed to integrate image and point-cloud features. For enhancing
+the efficiency of the model, we apply a binarization pipeline to each model
+component, including a variant of vision transformer (ViT) architecture as the
+encoder of the image branch, and new focal and perception losses to optimize
+the model training. The experimental results on two datasets demonstrate that
+our pathfinder method achieves SOTA accuracy with high efficiency in finding
+paths from the low-level airborne sensors, and we can create complete OSM prior
+maps based on the segmented road skeletons. Code and data are available at:
+https://github.com/IMRL/Pathfinder.
+
+
+
+
+
+
+ + ☆ Task-Specific Data Preparation for Deep Learning to Reconstruct + Structures of Interest from Severely Truncated CBCT Data + + +
+ Cone-beam computed tomography (CBCT) is widely used in interventional
+surgeries and radiation oncology. Due to the limited size of flat-panel
+detectors, anatomical structures might be missing outside the limited
+field-of-view (FOV), which restricts the clinical applications of CBCT systems.
+Recently, deep learning methods have been proposed to extend the FOV for
+multi-slice CT systems. However, in mobile CBCT systems with a smaller FOV,
+projection data is severely truncated and it is challenging for a network to
+restore all missing structures outside the FOV. In some applications, only
+certain structures outside the FOV are of interest, e.g., ribs in needle path
+planning for liver/lung cancer diagnosis. Therefore, a task-specific data
+preparation method is proposed in this work, which automatically lets the
+network focus on structures of interest instead of all structures. Our
+preliminary experiment shows that Pix2pixGAN with conventional training risks
+reconstructing false positive and false negative rib structures from severely
+truncated CBCT data, whereas Pix2pixGAN with the proposed task-specific
+training can reconstruct all the ribs reliably. The proposed method is
+promising for empowering CBCT with more clinical applications.
+
+
+
+ comment: Published in the CT-Meeting 2024 proceeding. arXiv admin note: text + overlap with arXiv:2108.13844 +
+
+
+
+
+ + ☆ Contactless Fingerprint Recognition Using 3D Graph Matching + + +
+ Contactless fingerprints are a newly developed type of fingerprint and have
+gained much attention in recent fingerprint studies. However, most existing
+contactless fingerprint algorithms treat contactless fingerprints as 2D plain
+fingerprints, and utilize recognition methods similar to those for traditional
+contact-based 2D fingerprints. This recognition approach does not consider the
+modality difference between contactless and contact fingerprints, especially
+the intrinsic 3D characteristic of contactless fingerprints. This paper
+proposes a novel contactless fingerprint recognition algorithm that captures
+the revealed 3D feature of contactless fingerprints rather than the plain 2D
+feature. The proposed method first recovers 3D features from the input
+contactless fingerprint, including the 3D shape model and 3D fingerprint
+feature (minutiae, orientation, etc.). Then, a novel 3D graph matching is
+conducted in 3D space according to the extracted 3D feature. Our method
+captures the real 3D nature of contactless fingerprints as the whole feature
+extraction and matching algorithms are completed in real 3D space. Experimental
+results on contactless fingerprint databases show that the proposed method
+successfully improves the matching accuracy of contactless fingerprints.
+Notably, our method performs stably across multiple poses of contactless
+fingerprints due to 3D graph matching, which is a great advantage compared to
+previous contactless fingerprint recognition algorithms.
+
+
+
+
+
+
+ + ☆ On the Computation of BD-Rate over a Set of Videos for Fair Assessment + of Performance of Learned Video Codecs ICASSP 2025 + + +
+ The Bjøntegaard Delta (BD) measure is widely employed to evaluate and
+quantify the variations in the rate-distortion (RD) performance across
+different codecs. Many researchers report the average BD value over multiple
+videos within a dataset for different codecs. We claim that the current
+practice in the learned video compression community of computing the average BD
+value over a dataset based on the average RD curve of multiple videos can lead
+to misleading conclusions. We show both by analysis of a simplistic case of
+linear RD curves and experimental results with two recent learned video codecs
+that averaging RD curves can allow a single video to disproportionately
+influence the average BD value, especially when the operating bitrate ranges of
+different codecs do not exactly match. Instead, we advocate calculating the BD
+measure on a per-video basis, as commonly done by the traditional video
+compression community, followed by averaging the individual BD values over
+videos, to provide a fair comparison of learned video codecs. Our experimental
+results demonstrate that the comparison of two recent learned video codecs is
+affected by how we evaluate the average BD measure.
+
+
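+ The advocated protocol is easy to state in code. Below is a compact sketch
+using the standard cubic-fit BD-rate with PSNR as the quality metric; four RD
+points per video and per codec are assumed inputs.
+
+import numpy as np
+
+def bd_rate(rates_a, psnr_a, rates_b, psnr_b):
+    """Bjontegaard delta rate (%) of codec B relative to codec A, computed
+    with the usual cubic fit of log-rate as a function of PSNR."""
+    la, lb = np.log10(rates_a), np.log10(rates_b)
+    pa, pb = np.polyfit(psnr_a, la, 3), np.polyfit(psnr_b, lb, 3)
+    lo = max(min(psnr_a), min(psnr_b))
+    hi = min(max(psnr_a), max(psnr_b))
+    ia = np.polyval(np.polyint(pa), hi) - np.polyval(np.polyint(pa), lo)
+    ib = np.polyval(np.polyint(pb), hi) - np.polyval(np.polyint(pb), lo)
+    return (10 ** ((ib - ia) / (hi - lo)) - 1) * 100
+
+def average_bd_rate(per_video_points):
+    """Average of per-video BD-rates (the advocated practice), where each item
+    is a tuple (rates_a, psnr_a, rates_b, psnr_b) for one video."""
+    return float(np.mean([bd_rate(*pts) for pts in per_video_points]))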
+
+ comment: Submitted to IEEE ICASSP 2025 +
+
+
+
+
+ + ☆ Causal Transformer for Fusion and Pose Estimation in Deep Visual + Inertial Odometry ECCV 2024 + + +
+ In recent years, transformer-based architectures have become the de facto
+standard for sequence modeling in deep learning frameworks. Inspired by the
+successful examples, we propose a causal visual-inertial fusion transformer
+(VIFT) for pose estimation in deep visual-inertial odometry. This study aims to
+improve pose estimation accuracy by leveraging the attention mechanisms in
+transformers, which better utilize historical data compared to the recurrent
+neural network (RNN) based methods seen in recent works. Transformers typically
+require large-scale data for training. To address this issue, we utilize
+inductive biases for deep VIO networks. Since latent visual-inertial feature
+vectors encompass essential information for pose estimation, we employ
+transformers to refine pose estimates by updating latent vectors temporally.
+Our study also examines the impact of data imbalance and rotation learning
+methods in supervised end-to-end learning of visual inertial odometry by
+utilizing specialized gradients in backpropagation for the elements of the
+SE$(3)$ group. The proposed method is end-to-end trainable and requires only a
+monocular camera and IMU during inference. Experimental results demonstrate
+that VIFT increases the accuracy of monocular VIO networks, achieving
+state-of-the-art results when compared to previous methods on the KITTI
+dataset. The code will be made available at https://github.com/ybkurt/VIFT.
+
+
+
+ comment: Accepted to ECCV 2024 2nd Workshop on Vision-Centric Autonomous + Driving (VCAD) +
+
+
+
+
+ + ☆ Uncertainty and Generalizability in Foundation Models for Earth + Observation + + +
+ We take the perspective in which we want to design a downstream task (such as +estimating vegetation coverage) on a certain area of interest (AOI) with a +limited labeling budget. By leveraging an existing Foundation Model (FM) we +must decide whether we train a downstream model on a different but label-rich +AOI hoping it generalizes to our AOI, or we split labels in our AOI for +training and validating. In either case, we face choices concerning what FM to +use, how to sample our AOI for labeling, etc. which affect both the performance +and uncertainty of the results. In this work, we perform a large ablative study +using eight existing FMs on either Sentinel 1 or Sentinel 2 as input data, and +the classes from the ESA World Cover product as downstream tasks across eleven +AOIs. We do repeated sampling and training, resulting in an ablation of some +500K simple linear regression models. Our results show both the limits of +spatial generalizability across AOIs and the power of FMs where we are able to +get over 0.9 correlation coefficient between predictions and targets on +different chip level predictive tasks. And still, performance and uncertainty +vary greatly across AOIs, tasks and FMs. We believe this is a key issue in +practice, because there are many design decisions behind each FM and downstream +task (input modalities, sampling, architectures, pretraining, etc.) and usually +a downstream task designer is aware of and can decide upon a few of them. +Through this work, we advocate for the usage of the methodology herein +described (large ablations on reference global labels and simple probes), both +when publishing new FMs, and to make informed decisions when designing +downstream tasks to use them. + +
+
+ comment: A large ablation study measuring uncertainty and spatial + generalizability with 8 foundation models, 11 world regions and 7 downstream + tasks +
+
+
+
+
+ + ☆ Layerwise Change of Knowledge in Neural Networks + + +
+ This paper aims to explain how a deep neural network (DNN) gradually extracts
+new knowledge and forgets noisy features through layers in forward propagation.
+Although the definition of the knowledge encoded by a DNN has not yet reached a
+consensus, previous studies have derived a series of mathematical evidence for
+taking interactions as symbolic primitive inference patterns encoded by a DNN.
+We extend the definition of interactions and, for the first time, extract
+interactions encoded by intermediate layers. We quantify and track the newly
+emerged interactions and the forgotten interactions in each layer during the
+forward propagation, which sheds new light on the learning behavior of DNNs.
+The layer-wise change of interactions also reveals the change of the
+generalization capacity and instability of feature representations of a DNN.
+
+
+
+
+
+
+ + ☆ Precision Aquaculture: An Integrated Computer Vision and IoT Approach + for Optimized Tilapia Feeding + + +
+ Traditional fish farming practices often lead to inefficient feeding,
+resulting in environmental issues and reduced productivity. We developed an
+innovative system combining computer vision and IoT technologies for precise
+Tilapia feeding. Our solution uses real-time IoT sensors to monitor water
+quality parameters and computer vision algorithms to analyze fish size and
+count, determining optimal feed amounts. A mobile app enables remote monitoring
+and control. We utilized YOLOv8 for keypoint detection to measure Tilapia
+weight from length, achieving 94% precision on 3,500 annotated images.
+Pixel-based measurements were converted to centimeters using depth estimation
+for accurate feeding calculations. Our method, with data collection mirroring
+inference conditions, significantly improved results. Preliminary estimates
+suggest this approach could increase production up to 58 times compared to
+traditional farms. Our models, code, and dataset are open-source and available
+upon reasonable request.
+
+
+
+ comment: 8 pages, 6 figures, 3 tables, 21st International Conference on
+ Informatics in Control, Automation, and Robotics
+
+
+
+
+
+ + ☆ Autoregressive Sequence Modeling for 3D Medical Image Representation + + +
+ Three-dimensional (3D) medical images, such as Computed Tomography (CT) and +Magnetic Resonance Imaging (MRI), are essential for clinical applications. +However, the need for diverse and comprehensive representations is particularly +pronounced when considering the variability across different organs, diagnostic +tasks, and imaging modalities. How to effectively interpret the intricate +contextual information and extract meaningful insights from these images +remains an open challenge to the community. While current self-supervised +learning methods have shown potential, they often consider an image as a whole +thereby overlooking the extensive, complex relationships among local regions +from one or multiple images. In this work, we introduce a pioneering method for +learning 3D medical image representations through an autoregressive +pre-training framework. Our approach sequences various 3D medical images based +on spatial, contrast, and semantic correlations, treating them as +interconnected visual tokens within a token sequence. By employing an +autoregressive sequence modeling task, we predict the next visual token in the +sequence, which allows our model to deeply understand and integrate the +contextual information inherent in 3D medical images. Additionally, we +implement a random startup strategy to avoid overestimating token relationships +and to enhance the robustness of learning. The effectiveness of our approach is +demonstrated by the superior performance over others on nine downstream tasks +in public datasets. + +
+
+
+
+
+ + ☆ GenMapping: Unleashing the Potential of Inverse Perspective Mapping for + Robust Online HD Map Construction + + +
+ Online High-Definition (HD) maps have emerged as the preferred option for +autonomous driving, overshadowing the counterpart offline HD maps due to +flexible update capability and lower maintenance costs. However, contemporary +online HD map models embed parameters of visual sensors into training, +resulting in a significant decrease in generalization performance when applied +to visual sensors with different parameters. Inspired by the inherent potential +of Inverse Perspective Mapping (IPM), where camera parameters are decoupled +from the training process, we have designed a universal map generation +framework, GenMapping. The framework is established with a triadic synergy +architecture, including principal and dual auxiliary branches. When faced with +a coarse road image with local distortion translated via IPM, the principal +branch learns robust global features under the state space models. The two +auxiliary branches are a dense perspective branch and a sparse prior branch. +The former exploits the correlation information between static and moving +objects, whereas the latter introduces the prior knowledge of OpenStreetMap +(OSM). The triple-enhanced merging module is crafted to synergistically +integrate the unique spatial features from all three branches. To further +improve generalization capabilities, a Cross-View Map Learning (CVML) scheme is +leveraged to realize joint learning within the common space. Additionally, a +Bidirectional Data Augmentation (BiDA) module is introduced to mitigate +reliance on datasets concurrently. A thorough array of experimental results +shows that the proposed model surpasses current state-of-the-art methods in +both semantic mapping and vectorized mapping, while also maintaining a rapid +inference speed. The source code will be publicly available at +https://github.com/lynn-yu/GenMapping. + +
+
+ comment: The source code will be publicly available at + https://github.com/lynn-yu/GenMapping +
+
+
+
+
+ + ☆ AdR-Gaussian: Accelerating Gaussian Splatting with Adaptive Radius SIGGRAPH + + +
+ 3D Gaussian Splatting (3DGS) is a recent explicit 3D representation that has
+achieved high-quality reconstruction and real-time rendering of complex scenes.
+However, the rasterization pipeline still suffers from unnecessary overhead
+resulting from avoidable serial Gaussian culling, and uneven load due to the
+differing number of Gaussians to be rendered across pixels, which hinders wider
+promotion and application of 3DGS. In order to accelerate Gaussian splatting,
+we propose AdR-Gaussian, which moves part of the serial culling in the Render
+stage into the earlier Preprocess stage to enable parallel culling, employing
+adaptive radius to narrow the rendering pixel range for each Gaussian, and
+introduces a load balancing method to minimize thread waiting time during the
+pixel-parallel rendering. Our contributions are threefold, achieving a
+rendering speed of 310% while maintaining equivalent or even better quality
+than the state-of-the-art. Firstly, we propose to early cull Gaussian-Tile
+pairs of low splatting opacity based on an adaptive radius in the
+Gaussian-parallel Preprocess stage, which reduces the number of affected tiles
+through the Gaussian bounding circle, thus reducing unnecessary overhead and
+achieving faster rendering speed. Secondly, we further propose early culling
+based on axis-aligned bounding box for Gaussian splatting, which achieves a
+more significant reduction in ineffective expenses by accurately calculating
+the Gaussian size in the 2D directions. Thirdly, we propose a balancing
+algorithm for pixel thread load, which compresses the information of heavy-load
+pixels to reduce thread waiting time, and enhances the information of
+light-load pixels to hedge against rendering quality loss. Experiments on three
+datasets demonstrate that our algorithm can significantly improve the Gaussian
+Splatting rendering speed.
+
+
+
+ comment: SIGGRAPH Asia 2024 Conference Papers (SA Conference Papers '24), + December 03-06, 2024, Tokyo, Japan +
+
+
+
+
+ + ☆ Test-time Training for Hyperspectral Image Super-resolution + + +
+ The progress on Hyperspectral image (HSI) super-resolution (SR) is still +lagging behind the research of RGB image SR. HSIs usually have a high number of +spectral bands, so accurately modeling spectral band interaction for HSI SR is +hard. Also, training data for HSI SR is hard to obtain so the dataset is +usually rather small. In this work, we propose a new test-time training method +to tackle this problem. Specifically, a novel self-training framework is +developed, where more accurate pseudo-labels and more accurate LR-HR +relationships are generated so that the model can be further trained with them +to improve performance. In order to better support our test-time training +method, we also propose a new network architecture to learn HSI SR without +modeling spectral band interaction and propose a new data augmentation method +Spectral Mixup to increase the diversity of the training data at test time. We +also collect a new HSI dataset with a diverse set of images of interesting +objects ranging from food to vegetation, to materials, and to general scenes. +Extensive experiments on multiple datasets show that our method can improve the +performance of pre-trained models significantly after test-time training and +outperform competing methods significantly for HSI SR. + +
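+ One way to picture a spectral-domain mixup is sketched below; treating the
+augmentation as a random convex recombination of bands is an assumption made
+for illustration, not necessarily the paper's exact recipe.
+
+import numpy as np
+
+def spectral_mixup(hsi: np.ndarray, alpha: float = 0.2, seed: int = 0) -> np.ndarray:
+    """Augment an (H, W, C) hyperspectral cube by replacing each band with a
+    random convex combination of all bands (a plausible 'Spectral Mixup'
+    style augmentation; the paper's exact formulation may differ)."""
+    rng = np.random.default_rng(seed)
+    c = hsi.shape[-1]
+    # mostly-identity mixing matrix: each output band leans on its own input band
+    mix = (1 - alpha) * np.eye(c) + alpha * rng.dirichlet(np.ones(c), size=c)
+    return hsi @ mix.T
+
+# usage: augment a toy 64x64 cube with 31 spectral bands
+aug = spectral_mixup(np.random.rand(64, 64, 31))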
+
+ comment: Accepted to T-PAMI +
+
+
+
+
+ + ☆ SkinFormer: Learning Statistical Texture Representation with Transformer + for Skin Lesion Segmentation + + +
+ Accurate skin lesion segmentation from dermoscopic images is of great
+importance for skin cancer diagnosis. However, automatic segmentation of
+melanoma remains a challenging task because it is difficult to incorporate
+useful texture representations into the learning process. Texture
+representations are not only related to the local structural information
+learned by CNNs, but also include the global statistical texture information of
+the input image. In this paper, we propose a Transformer network (SkinFormer)
+that efficiently extracts and fuses statistical texture representations for
+skin lesion segmentation. Specifically, to quantify the statistical texture of
+input features, a Kurtosis-guided Statistical Counting Operator is designed. We
+propose Statistical Texture Fusion Transformer and Statistical Texture Enhance
+Transformer with the help of the Kurtosis-guided Statistical Counting Operator
+by utilizing the transformer's global attention mechanism. The former fuses
+structural texture information and statistical texture information, and the
+latter enhances the statistical texture of multi-scale features. Extensive
+experiments on three publicly available skin lesion datasets validate that our
+SkinFormer outperforms other SOTA methods, and our method achieves a 93.2% Dice
+score on ISIC 2018. SkinFormer can easily be extended to segment 3D images in
+the future. Our code is available at https://github.com/Rongtao-Xu/SkinFormer.
+
+
+
+ comment: 12 pages, 8 figures, published to JBHI +
+
+
+
+
+ + ☆ Joint image reconstruction and segmentation of real-time cardiac MRI in + free-breathing using a model based on disentangled representation learning + + +
+ A joint image reconstruction and segmentation approach based on disentangled +representation learning was trained to enable cardiac cine MR imaging in +real-time and under free-breathing. An exploratory feasibility study tested the +proposed method in undersampled real-time acquisitions based on an in-house +developed spiral bSSFP pulse sequence in eight healthy participants and five +patients with intermittent atrial fibrillation. Images and predicted LV +segmentations were compared to the reference standard of ECG-gated segmented +Cartesian cine in repeated breath-holds and corresponding manual segmentation. +On a 5-point Likert scale, image quality of the real-time breath-hold approach +and Cartesian cine was comparable in healthy participants (RT-BH: 1.99 $\pm$ +.98, Cartesian: 1.94 $\pm$ .86, p=.052), but slightly inferior in +free-breathing (RT-FB: 2.40 $\pm$ .98, p<.001). In patients with arrhythmia, +image quality from both real-time approaches was favourable (RT-BH: 2.10 $\pm$ +1.28, p<.001, RT-FB: 2.40 $\pm$ 1.13, p<.001, Cartesian: 2.68 $\pm$ 1.13). +Intra-observer reliability was good (ICC=.77, 95%-confidence interval [.75, +.79], p<.001). In functional analysis, a positive bias was observed for +ejection fractions derived from the proposed model compared to the clinical +reference standard (RT-BH mean EF: 58.5 $\pm$ 5.6%, bias: +3.47%, +95%-confidence interval [-.86, 7.79%], RT-FB mean: 57.9 $\pm$ 10.6%, bias: ++1.45%, [-3.02, 5.91%], Cartesian mean: 54.9 $\pm$ 6.7%). The introduced +real-time MR imaging technique is capable of acquiring high-quality cardiac +cine data in 1-2 minutes without the need for ECG gating and breath-holds. It +thus offers a promising alternative to the current clinical practice of +segmented acquisition, with shorter scan times, higher patient comfort and +increased robustness to arrhythmia and patient incompliance. + +
+
+ comment: Submitted to the Journal of Cardiovascular Magnetic Resonance +
+
+
+
+
+ + ☆ TapToTab : Video-Based Guitar Tabs Generation using AI and Audio + Analysis + + +
+ The automation of guitar tablature generation from video inputs holds +significant promise for enhancing music education, transcription accuracy, and +performance analysis. Existing methods face challenges with consistency and +completeness, particularly in detecting fretboards and accurately identifying +notes. To address these issues, this paper introduces an advanced approach +leveraging deep learning, specifically YOLO models for real-time fretboard +detection, and Fourier Transform-based audio analysis for precise note +identification. Experimental results demonstrate substantial improvements in +detection accuracy and robustness compared to traditional techniques. This +paper outlines the development, implementation, and evaluation of these +methodologies, aiming to revolutionize guitar instruction by automating the +creation of guitar tabs from video recordings. + +
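+ As a rough illustration of the Fourier-transform-based note identification
+step described above, the following minimal sketch maps the dominant spectral
+peak of a mono audio frame to the nearest equal-tempered pitch. The frame
+length, windowing, and naming helper are illustrative assumptions, not the
+paper's pipeline (which also fuses the visual fretboard detections).
+
+```python
+import numpy as np
+
+NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
+
+def dominant_note(frame: np.ndarray, sample_rate: int = 44100) -> str:
+    """Return the equal-tempered note closest to the strongest spectral peak."""
+    spectrum = np.abs(np.fft.rfft(frame * np.hanning(len(frame))))
+    freqs = np.fft.rfftfreq(len(frame), d=1.0 / sample_rate)
+    peak_freq = freqs[np.argmax(spectrum[1:]) + 1]  # skip the DC bin
+    midi = int(round(69 + 12 * np.log2(peak_freq / 440.0)))  # A4 = 440 Hz = MIDI 69
+    return f"{NOTE_NAMES[midi % 12]}{midi // 12 - 1}"
+
+if __name__ == "__main__":
+    sr, f0 = 44100, 196.0  # open G string
+    t = np.arange(0, 0.5, 1.0 / sr)
+    print(dominant_note(np.sin(2 * np.pi * f0 * t), sr))  # -> G3
+```
+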
+
+
+
+
+ + ☆ Dense Point Clouds Matter: Dust-GS for Scene Reconstruction from Sparse + Viewpoints + + +
+ 3D Gaussian Splatting (3DGS) has demonstrated remarkable performance in scene +synthesis and novel view synthesis tasks. Typically, the initialization of 3D +Gaussian primitives relies on point clouds derived from Structure-from-Motion +(SfM) methods. However, in scenarios requiring scene reconstruction from sparse +viewpoints, the effectiveness of 3DGS is significantly constrained by the +quality of these initial point clouds and the limited number of input images. +In this study, we present Dust-GS, a novel framework specifically designed to +overcome the limitations of 3DGS in sparse viewpoint conditions. Instead of +relying solely on SfM, Dust-GS introduces an innovative point cloud +initialization technique that remains effective even with sparse input data. +Our approach leverages a hybrid strategy that integrates an adaptive +depth-based masking technique, thereby enhancing the accuracy and detail of +reconstructed scenes. Extensive experiments conducted on several benchmark +datasets demonstrate that Dust-GS surpasses traditional 3DGS methods in +scenarios with sparse viewpoints, achieving superior scene reconstruction +quality with a reduced number of input images. + +
+
+
+
+
+ + ☆ Knowledge-Enhanced Facial Expression Recognition with + Emotional-to-Neutral Transformation + + +
+ Existing facial expression recognition (FER) methods typically fine-tune a
+pre-trained visual encoder using discrete labels. However, this form of
+supervision limits the ability to specify the emotional concepts underlying
+different facial expressions. In this paper, we observe that the rich
+knowledge in text embeddings, generated by vision-language models, is a
+promising alternative for learning discriminative facial expression
+representations. Inspired by this, we propose a novel knowledge-enhanced FER
+method with an emotional-to-neutral transformation. Specifically, we formulate
+the FER problem as a process to match the similarity between a facial
+expression representation and text embeddings. Then, we transform the facial
+expression representation to a neutral representation by simulating the
+difference in text embeddings from textual facial expression to textual
+neutral. Finally, a self-contrast objective is introduced to pull the facial
+expression representation closer to the textual facial expression, while
+pushing it farther from the neutral representation. We conduct evaluations
+with diverse pre-trained visual encoders, including ResNet-18 and Swin-T, on
+four challenging facial expression datasets. Extensive experiments demonstrate
+that our method significantly outperforms state-of-the-art FER methods. The
+code will be publicly available.
+
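+ A minimal sketch of what such a self-contrast objective could look like,
+assuming the facial representation, the textual expression embedding, and the
+transformed neutral representation are vectors of the same dimension; the
+InfoNCE-style form and the temperature value are illustrative assumptions, not
+the authors' exact loss.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def self_contrast_loss(face_repr, text_expr, neutral_repr, temperature=0.07):
+    """Pull the facial representation towards its textual expression embedding
+    and push it away from the neutral representation. All inputs: (B, D)."""
+    face = F.normalize(face_repr, dim=-1)
+    pos = F.normalize(text_expr, dim=-1)
+    neg = F.normalize(neutral_repr, dim=-1)
+    pos_sim = (face * pos).sum(-1) / temperature          # similarity to the positive
+    neg_sim = (face * neg).sum(-1) / temperature          # similarity to the negative
+    logits = torch.stack([pos_sim, neg_sim], dim=1)       # (B, 2)
+    labels = torch.zeros(face.size(0), dtype=torch.long, device=face.device)
+    return F.cross_entropy(logits, labels)                # positive is always class 0
+```
+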
+
+
+
+
+ + ☆ Improved Unet model for brain tumor image segmentation based on + ASPP-coordinate attention mechanism + + +
+ In this paper, we propose an improved Unet model for brain tumor image
+segmentation, which combines the coordinate attention mechanism and the ASPP
+module to improve segmentation performance. After the dataset is split, we
+apply the necessary preprocessing to the images and run experiments with the
+improved model. First, we train and validate the traditional Unet model. The
+loss curves of the training and validation sets show that the loss value
+declines steadily from the first epoch and stabilizes by the eighth epoch,
+indicating that the model continuously optimizes its parameters to improve
+performance. At the same time, the mIoU (mean Intersection over Union) index
+exceeded 0.6 at the 15th epoch, remained above 0.6 thereafter, and surpassed
+0.7 at the 46th epoch. These results indicate that the basic Unet model is
+effective for brain tumor image segmentation. Next, we introduce the improved
+Unet algorithm based on the coordinate attention mechanism and the ASPP
+module. The loss curves of the training and validation sets show that the loss
+value reaches its lowest point at the sixth epoch and then remains relatively
+stable, while the mIoU has stabilized above 0.7 since the 20th epoch, reaching
+a maximum of 0.76. These results show that the newly introduced mechanisms
+significantly improve the segmentation ability of the model. Finally, we apply
+both the trained traditional Unet model and the improved Unet model based on
+the coordinate attention mechanism and ASPP module to the test set for brain
+tumor segmentation prediction. Compared to the traditional Unet, the enhanced
+model offers superior segmentation and edge accuracy, providing a more
+reliable method for medical image analysis.
+
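+ For readers unfamiliar with the coordinate attention block mentioned above,
+the sketch below follows the published formulation (Hou et al., CVPR 2021):
+average-pool along each spatial axis, encode the two pooled maps jointly, and
+re-weight the feature map per direction. The reduction ratio and layer choices
+are common defaults and not necessarily those used in this paper's improved
+Unet.
+
+```python
+import torch
+import torch.nn as nn
+
+class CoordinateAttention(nn.Module):
+    """Directional pooling + per-axis re-weighting of a (B, C, H, W) feature map."""
+
+    def __init__(self, channels: int, reduction: int = 32):
+        super().__init__()
+        mid = max(8, channels // reduction)
+        self.conv1 = nn.Conv2d(channels, mid, kernel_size=1)
+        self.bn = nn.BatchNorm2d(mid)
+        self.act = nn.ReLU(inplace=True)
+        self.conv_h = nn.Conv2d(mid, channels, kernel_size=1)
+        self.conv_w = nn.Conv2d(mid, channels, kernel_size=1)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, c, h, w = x.shape
+        x_h = x.mean(dim=3, keepdim=True)                          # (B, C, H, 1)
+        x_w = x.mean(dim=2, keepdim=True).permute(0, 1, 3, 2)      # (B, C, W, 1)
+        y = self.act(self.bn(self.conv1(torch.cat([x_h, x_w], dim=2))))
+        y_h, y_w = torch.split(y, [h, w], dim=2)
+        a_h = torch.sigmoid(self.conv_h(y_h))                      # attention along height
+        a_w = torch.sigmoid(self.conv_w(y_w.permute(0, 1, 3, 2)))  # attention along width
+        return x * a_h * a_w
+```
+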
+
+ comment: 5 pages, 8 figures, accepted by ICBASE 2024 +
+
+
+
+
+ + ☆ Optimizing 4D Lookup Table for Low-light Video Enhancement via Wavelet + Priori + + +
+ Low-light video enhancement places high demands on maintaining spatiotemporal
+color consistency, so improving the accuracy of color mapping while keeping
+latency low is challenging. To this end, we propose WaveLUT, a wavelet-prior
+4D lookup table that effectively enhances the color coherence between video
+frames and the accuracy of color mapping while maintaining low latency.
+Specifically, we use the wavelet low-frequency domain to construct an
+optimized lookup prior and achieve an adaptive enhancement effect through the
+designed wavelet-prior 4D lookup table. To effectively compensate for the
+prior loss in low-light regions, we further explore a dynamic fusion strategy
+that adaptively determines spatial weights based on the correlation between
+the wavelet lighting prior and the target intensity structure. In addition,
+during the training phase, we devise a text-driven appearance reconstruction
+method that dynamically balances brightness and content through multimodal
+semantics-driven Fourier spectra. Extensive experiments on a wide range of
+benchmark datasets show that our method improves on previous methods' ability
+to perceive the color space and achieves metrically favorable and perceptually
+oriented real-time enhancement while maintaining high efficiency.
+
+
+
+
+
+ + ☆ ChangeChat: An Interactive Model for Remote Sensing Change Analysis via + Multimodal Instruction Tuning + + +
+ Remote sensing (RS) change analysis is vital for monitoring Earth's dynamic +processes by detecting alterations in images over time. Traditional change +detection excels at identifying pixel-level changes but lacks the ability to +contextualize these alterations. While recent advancements in change captioning +offer natural language descriptions of changes, they do not support +interactive, user-specific queries. To address these limitations, we introduce +ChangeChat, the first bitemporal vision-language model (VLM) designed +specifically for RS change analysis. ChangeChat utilizes multimodal instruction +tuning, allowing it to handle complex queries such as change captioning, +category-specific quantification, and change localization. To enhance the +model's performance, we developed the ChangeChat-87k dataset, which was +generated using a combination of rule-based methods and GPT-assisted +techniques. Experiments show that ChangeChat offers a comprehensive, +interactive solution for RS change analysis, achieving performance comparable +to or even better than state-of-the-art (SOTA) methods on specific tasks, and +significantly surpassing the latest general-domain model, GPT-4. Code and +pre-trained weights are available at https://github.com/hanlinwu/ChangeChat. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ HTR-VT: Handwritten Text Recognition with Vision Transformer + + +
+ We explore the application of Vision Transformers (ViTs) to handwritten text
+recognition. The limited availability of labeled data in this domain poses
+challenges for achieving high performance when relying solely on a ViT.
+Previous transformer-based models required external data or extensive
+pre-training on large datasets to excel. To address this limitation, we
+introduce a data-efficient ViT method that uses only the encoder of the
+standard transformer. We find that incorporating a Convolutional Neural
+Network (CNN) for feature extraction, instead of the original patch embedding,
+and employing the Sharpness-Aware Minimization (SAM) optimizer to ensure that
+the model converges towards flatter minima yields notable improvements.
+Furthermore, our span mask technique, which masks interconnected features in
+the feature map, acts as an effective regularizer. Empirically, our approach
+competes favorably with traditional CNN-based models on small datasets like
+IAM and READ2016. Additionally, it establishes a new benchmark on the LAM
+dataset, currently the largest dataset, with 19,830 training text lines. The
+code is publicly available at: https://github.com/YutingLi0606/HTR-VT.
+
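+ The span mask idea masks contiguous regions of the feature sequence rather
+than isolated positions. Below is a minimal, hypothetical sketch of such a
+regularizer applied to a (B, C, T) feature sequence produced by the CNN
+extractor; the span length and mask ratio are illustrative defaults, not the
+paper's settings.
+
+```python
+import torch
+
+def span_mask(features: torch.Tensor, span: int = 4, mask_ratio: float = 0.3) -> torch.Tensor:
+    """Zero out random contiguous spans along the last axis of (B, C, T) features."""
+    b, _, t = features.shape
+    n_spans = max(1, int(t * mask_ratio / span))
+    mask = torch.ones(b, 1, t, device=features.device)
+    for i in range(b):
+        starts = torch.randint(0, max(1, t - span), (n_spans,))
+        for s in starts.tolist():
+            mask[i, 0, s : s + span] = 0.0  # one interconnected (contiguous) span
+    return features * mask
+
+if __name__ == "__main__":
+    masked = span_mask(torch.randn(2, 256, 128))
+    print((masked == 0).float().mean().item())  # roughly the masked fraction
+```
+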
+
+ comment: Accepted to Pattern Recognition +
+
+
+
+
+ + ☆ DiffFAS: Face Anti-Spoofing via Generative Diffusion Models ECCV 24 + + +
+ Face anti-spoofing (FAS) plays a vital role in preventing face recognition
+(FR) systems from presentation attacks. Nowadays, FAS systems face the
+challenge of domain shift, which impacts the generalization performance of
+existing FAS methods. In this paper, we rethink the nature of domain shift and
+deconstruct it into two factors: image style and image quality. Quality
+influences the purity of the presentation of spoof information, while style
+affects the manner in which spoof information is presented. Based on our
+analysis, we propose the DiffFAS framework, which quantifies quality as prior
+information input into the network to counter image quality shift, and
+performs diffusion-based high-fidelity cross-domain and cross-attack-type
+generation to counter image style shift. DiffFAS transforms easily collectible
+live faces into high-fidelity attack faces with precise labels while
+maintaining consistency between live and spoof face identities, which can also
+alleviate the scarcity of labeled data for the novel attack types faced by
+today's FAS systems. We demonstrate the effectiveness of our framework on
+challenging cross-domain and cross-attack FAS datasets, achieving
+state-of-the-art performance. Available at https://github.com/murphytju/DiffFAS.
+
+
+ comment: ECCV 24 +
+
+
+
+
+ + ☆ Hybrid-TTA: Continual Test-time Adaptation via Dynamic Domain Shift + Detection + + +
+ Continual Test Time Adaptation (CTTA) has emerged as a critical approach for
+bridging the domain gap between controlled training environments and
+real-world scenarios, enhancing model adaptability and robustness. Existing
+CTTA methods, typically categorized into Full-Tuning (FT) and Efficient-Tuning
+(ET), struggle to address domain shifts effectively. To overcome these
+challenges, we propose Hybrid-TTA, a holistic approach that dynamically
+selects an instance-wise tuning method for optimal adaptation. Our approach
+introduces the Dynamic Domain Shift Detection (DDSD) strategy, which
+identifies domain shifts by leveraging temporal correlations in input
+sequences and dynamically switches between FT and ET to adapt to varying
+domain shifts effectively. Additionally, the Masked Image Modeling based
+Adaptation (MIMA) framework is integrated to ensure domain-agnostic robustness
+with minimal computational overhead. Our Hybrid-TTA achieves a notable 1.6%p
+improvement in mIoU on the Cityscapes-to-ACDC benchmark dataset, surpassing
+previous state-of-the-art methods and offering a robust solution for
+real-world continual adaptation challenges.
+
+
+
+
+
+ + ☆ Second-order difference subspace + + +
+ Subspace representation is a fundamental technique in various fields of +machine learning. Analyzing a geometrical relationship among multiple subspaces +is essential for understanding subspace series' temporal and/or spatial +dynamics. This paper proposes the second-order difference subspace, a +higher-order extension of the first-order difference subspace between two +subspaces that can analyze the geometrical difference between them. As a +preliminary for that, we extend the definition of the first-order difference +subspace to the more general setting that two subspaces with different +dimensions have an intersection. We then define the second-order difference +subspace by combining the concept of first-order difference subspace and +principal component subspace (Karcher mean) between two subspaces, motivated by +the second-order central difference method. We can understand that the +first/second-order difference subspaces correspond to the velocity and +acceleration of subspace dynamics from the viewpoint of a geodesic on a +Grassmann manifold. We demonstrate the validity and naturalness of our +second-order difference subspace by showing numerical results on two +applications: temporal shape analysis of a 3D object and time series analysis +of a biometric signal. + +
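+ The finite-difference analogy can be made concrete. For a scalar trajectory
+$x(t)$, the first- and second-order central differences approximate velocity
+and acceleration; the paper lifts these two quantities to subspaces, with the
+principal component (Karcher mean) subspace playing the role of the midpoint.
+The classical scalar analogues are:
+
+```latex
+\[
+\dot{x}(t) \approx \frac{x(t+h) - x(t-h)}{2h}, \qquad
+\ddot{x}(t) \approx \frac{x(t+h) - 2\,x(t) + x(t-h)}{h^{2}}.
+\]
+```
+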
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ CSS: Overcoming Pose and Scene Challenges in Crowd-Sourced 3D Gaussian + Splatting + + +
+ We introduce Crowd-Sourced Splatting (CSS), a novel 3D Gaussian Splatting +(3DGS) pipeline designed to overcome the challenges of pose-free scene +reconstruction using crowd-sourced imagery. The dream of reconstructing +historically significant but inaccessible scenes from collections of +photographs has long captivated researchers. However, traditional 3D techniques +struggle with missing camera poses, limited viewpoints, and inconsistent +lighting. CSS addresses these challenges through robust geometric priors and +advanced illumination modeling, enabling high-quality novel view synthesis +under complex, real-world conditions. Our method demonstrates clear +improvements over existing approaches, paving the way for more accurate and +flexible applications in AR, VR, and large-scale 3D reconstruction. + +
+
+
+
+
+ + ☆ DICS: Find Domain-Invariant and Class-Specific Features for + Out-of-Distribution Generalization + + +
+ While deep neural networks have made remarkable progress in various vision +tasks, their performance typically deteriorates when tested in +out-of-distribution (OOD) scenarios. Many OOD methods focus on extracting +domain-invariant features but neglect whether these features are unique to each +class. Even if some features are domain-invariant, they cannot serve as key +classification criteria if shared across different classes. In OOD tasks, both +domain-related and class-shared features act as confounders that hinder +generalization. In this paper, we propose a DICS model to extract +Domain-Invariant and Class-Specific features, including Domain Invariance +Testing (DIT) and Class Specificity Testing (CST), which mitigate the effects +of spurious correlations introduced by confounders. DIT learns domain-related +features of each source domain and removes them from inputs to isolate +domain-invariant class-related features. DIT ensures domain invariance by +aligning same-class features across different domains. Then, CST calculates +soft labels for those features by comparing them with features learned in +previous steps. We optimize the cross-entropy between the soft labels and their +true labels, which enhances same-class similarity and different-class +distinctiveness, thereby reinforcing class specificity. Extensive experiments +on widely-used benchmarks demonstrate the effectiveness of our proposed +algorithm. Additional visualizations further demonstrate that DICS effectively +identifies the key features of each class in target domains. + +
+
+
+
+
+ + ☆ SRE-CNN: A Spatiotemporal Rotation-Equivariant CNN for Cardiac Cine MR + Imaging MICCAI 2024 + + +
+ Dynamic MR images possess various transformation symmetries, including the
+rotation symmetry of local features within the image and along the temporal
+dimension. Utilizing these symmetries as prior knowledge can facilitate
+dynamic MR imaging with high spatiotemporal resolution. Equivariant CNNs are
+an effective tool to leverage such symmetry priors. However, current
+equivariant CNN methods fail to fully exploit these symmetry priors in dynamic
+MR imaging. In this work, we propose a novel framework, the Spatiotemporal
+Rotation-Equivariant CNN (SRE-CNN), spanning from the underlying
+high-precision filter design to the construction of the temporal-equivariant
+convolutional module and imaging model, to fully harness the rotation
+symmetries inherent in dynamic MR images. The temporal-equivariant
+convolutional module enables the exploitation of rotation symmetries in both
+spatial and temporal dimensions, while the high-precision convolutional
+filter, based on a parametrization strategy, enhances the utilization of the
+rotation symmetry of local features to improve the reconstruction of detailed
+anatomical structures. Experiments conducted on highly undersampled dynamic
+cardiac cine data (up to 20X) have demonstrated the superior performance of
+our proposed approach, both quantitatively and qualitatively.
+
+
+ comment: Accepted at MICCAI 2024 +
+
+
+
+
+ + ☆ GroundingBooth: Grounding Text-to-Image Customization + + +
+ Recent studies in text-to-image customization show great success in +generating personalized object variants given several images of a subject. +While existing methods focus more on preserving the identity of the subject, +they often fall short of controlling the spatial relationship between objects. +In this work, we introduce GroundingBooth, a framework that achieves zero-shot +instance-level spatial grounding on both foreground subjects and background +objects in the text-to-image customization task. Our proposed text-image +grounding module and masked cross-attention layer allow us to generate +personalized images with both accurate layout alignment and identity +preservation while maintaining text-image coherence. With such layout control, +our model inherently enables the customization of multiple subjects at once. +Our model is evaluated on both layout-guided image synthesis and +reference-based customization tasks, showing strong results compared to +existing methods. Our work is the first work to achieve a joint grounding of +both subject-driven foreground generation and text-driven background +generation. + +
+
+
+
+
+ + ☆ Anytime Continual Learning for Open Vocabulary Classification ECCV 2024 + + +
+ We propose an approach for anytime continual learning (AnytimeCL) for open
+vocabulary image classification. The AnytimeCL problem aims to break away from
+batch training and rigid models by requiring that a system can predict any set
+of labels at any time and efficiently update and improve when receiving one or
+more training samples at any time. Despite this challenging goal, we achieve
+substantial improvements over recent methods. We propose a dynamic weighting
+between the predictions of a partially fine-tuned model and a fixed open
+vocabulary model, which enables continual improvement when training samples
+are available for a subset of a task's labels. We also propose an
+attention-weighted PCA compression of training features that reduces storage
+and computation with little impact on model accuracy. Our methods are
+validated with experiments that test the flexibility of learning and
+inference. Code is available at https://github.com/jessemelpolio/AnytimeCL.
+
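+ A minimal sketch of this kind of dynamic weighting: per-label probabilities
+from the partially fine-tuned model are blended with those of the frozen
+open-vocabulary model, with the tuned model weighted more heavily for labels
+it has actually seen training samples for. The specific weighting rule shown
+here is an illustrative assumption, not the authors' exact scheme.
+
+```python
+import torch
+
+def blended_prediction(p_tuned: torch.Tensor,
+                       p_open: torch.Tensor,
+                       samples_seen: torch.Tensor,
+                       k: float = 16.0) -> torch.Tensor:
+    """Blend per-label probabilities (B, L) from a tuned and a frozen model.
+    samples_seen: (L,) count of training samples observed per label."""
+    w = samples_seen / (samples_seen + k)               # in [0, 1): trust in the tuned model
+    blended = w * p_tuned + (1.0 - w) * p_open
+    return blended / blended.sum(dim=-1, keepdim=True)  # renormalize over labels
+
+if __name__ == "__main__":
+    p_t = torch.softmax(torch.randn(2, 5), dim=-1)
+    p_o = torch.softmax(torch.randn(2, 5), dim=-1)
+    seen = torch.tensor([0.0, 2.0, 10.0, 50.0, 0.0])
+    print(blended_prediction(p_t, p_o, seen).sum(dim=-1))  # tensor([1., 1.])
+```
+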
+
+ comment: To appear at ECCV 2024 as Oral presentation +
+
+
+
+
+ + ☆ AWF: Adaptive Weight Fusion for Enhanced Class Incremental Semantic + Segmentation + + +
+ Class Incremental Semantic Segmentation (CISS) aims to mitigate catastrophic
+forgetting by maintaining a balance between previously learned and newly
+introduced knowledge. Existing methods, primarily based on regularization
+techniques like knowledge distillation, help preserve old knowledge but often
+face challenges in effectively integrating new knowledge, resulting in limited
+overall improvement. The Endpoints Weight Fusion (EWF) method, while simple,
+effectively addresses some of these limitations by dynamically fusing the
+model weights from previous steps with those from the current step, using a
+fusion parameter alpha determined by the relative number of previously known
+classes and newly introduced classes. However, the simplicity of the alpha
+calculation may limit its ability to fully capture the complexities of
+different task scenarios, potentially leading to suboptimal fusion outcomes.
+In this paper, we propose an enhanced approach called Adaptive Weight Fusion
+(AWF), which introduces an alternating training strategy for the fusion
+parameter, allowing for more flexible and adaptive weight integration. AWF
+achieves superior performance by better balancing the retention of old
+knowledge with the learning of new classes, significantly improving results on
+benchmark CISS tasks compared to the original EWF. Our experiment code will be
+released on GitHub.
+
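+ For context, the EWF baseline that AWF extends fuses the endpoint weights of
+the previous-step and current-step models with a single scalar alpha derived
+from the class counts. The sketch below shows plain linear interpolation of
+matching parameters; the exact alpha formula used by EWF may differ, and AWF
+replaces the fixed rule with an adaptively trained coefficient.
+
+```python
+import torch
+
+def endpoint_weight_fusion(old_state: dict, new_state: dict,
+                           n_old_classes: int, n_new_classes: int) -> dict:
+    """theta = alpha * theta_old + (1 - alpha) * theta_new for every parameter."""
+    alpha = n_old_classes / (n_old_classes + n_new_classes)  # illustrative choice of alpha
+    return {name: alpha * w_old + (1.0 - alpha) * new_state[name]
+            for name, w_old in old_state.items()}
+
+if __name__ == "__main__":
+    old = {"layer.weight": torch.ones(3, 3)}
+    new = {"layer.weight": torch.zeros(3, 3)}
+    fused = endpoint_weight_fusion(old, new, n_old_classes=15, n_new_classes=5)
+    print(fused["layer.weight"][0, 0])  # tensor(0.7500)
+```
+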
+
+ comment: 10 pages,6 figures +
+
+
+
+
+ + ☆ Mamba-YOLO-World: Marrying YOLO-World with Mamba for Open-Vocabulary + Detection + + +
+ Open-vocabulary detection (OVD) aims to detect objects beyond a predefined
+set of categories. As a pioneering model incorporating the YOLO series into
+OVD, YOLO-World is well-suited for scenarios prioritizing speed and
+efficiency. However, its performance is hindered by its neck feature fusion
+mechanism, which causes quadratic complexity and limited guided receptive
+fields. To address these limitations, we present Mamba-YOLO-World, a novel
+YOLO-based OVD model employing the proposed MambaFusion Path Aggregation
+Network (MambaFusion-PAN) as its neck architecture. Specifically, we introduce
+an innovative State Space Model-based feature fusion mechanism consisting of a
+Parallel-Guided Selective Scan algorithm and a Serial-Guided Selective Scan
+algorithm with linear complexity and globally guided receptive fields. It
+leverages multi-modal input sequences and Mamba hidden states to guide the
+selective scanning process. Experiments demonstrate that our model outperforms
+the original YOLO-World on the COCO and LVIS benchmarks in both zero-shot and
+fine-tuning settings while maintaining comparable parameters and FLOPs.
+Additionally, it surpasses existing state-of-the-art OVD methods with fewer
+parameters and FLOPs.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ CasDyF-Net: Image Dehazing via Cascaded Dynamic Filters + + +
+ Image dehazing aims to restore image clarity and visual quality by reducing
+atmospheric scattering and absorption effects. While deep learning has made
+significant strides in this area, more and more methods are constrained by
+network depth. Consequently, many approaches have adopted parallel branching
+strategies. However, they often prioritize aspects such as resolution,
+receptive field, or frequency domain segmentation without dynamically
+partitioning branches based on the distribution of input features. Inspired by
+dynamic filtering, we propose using cascaded dynamic filters to create a
+multi-branch network by dynamically generating filter kernels based on the
+feature map distribution. To better handle branch features, we propose a
+residual multiscale block (RMB) that combines different receptive fields.
+Furthermore, we introduce a dynamic convolution-based local fusion method to
+merge features from adjacent branches. Experiments on the RESIDE, Haze4K, and
+O-Haze datasets validate our method's effectiveness, with our model achieving
+a PSNR of 43.21 dB on the RESIDE-Indoor dataset. The code is available at
+https://github.com/dauing/CasDyF-Net.
+
+
+ comment: 9 pages, 9 figures +
+
+
+
+
+ + ☆ Exploiting Supervised Poison Vulnerability to Strengthen Self-Supervised + Defense + + +
+ Availability poisons exploit supervised learning (SL) algorithms by +introducing class-related shortcut features in images such that models trained +on poisoned data are useless for real-world datasets. Self-supervised learning +(SSL), which utilizes augmentations to learn instance discrimination, is +regarded as a strong defense against poisoned data. However, by extending the +study of SSL across multiple poisons on the CIFAR-10 and ImageNet-100 datasets, +we demonstrate that it often performs poorly, far below that of training on +clean data. Leveraging the vulnerability of SL to poison attacks, we introduce +adversarial training (AT) on SL to obfuscate poison features and guide robust +feature learning for SSL. Our proposed defense, designated VESPR (Vulnerability +Exploitation of Supervised Poisoning for Robust SSL), surpasses the performance +of six previous defenses across seven popular availability poisons. VESPR +displays superior performance over all previous defenses, boosting the minimum +and average ImageNet-100 test accuracies of poisoned models by 16% and 9%, +respectively. Through analysis and ablation studies, we elucidate the +mechanisms by which VESPR learns robust class features. + +
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ☆ Identifying Human Indoor Daily Life Behavior employing Thermal Sensor + Arrays (TSAs) + + +
+ Daily activity monitoring systems used in households provide vital
+information about health status, particularly for aging residents. Multiple
+approaches have been introduced to achieve such goals, typically categorized
+as obtrusive or non-obtrusive. Among the obtrusive approaches are wearable
+devices, and among the non-obtrusive approaches are movement detection
+systems, including motion sensors and thermal sensor arrays (TSAs). TSA
+systems are advantageous in preserving a person's privacy while still
+pinpointing their precise spatial location. In this study, human daily living
+activities were monitored day and night using a TSA system, and the
+corresponding activity time series and spatial probability distributions were
+constructed. The monitored activities are classified into two categories:
+sleeping and daily activity. Results showed that the two classes can be
+distinguished regardless of day and night. The obtained sleep activity
+duration was compared with previous research using the same raw data. Results
+showed that the duration of sleep activity was, on average, 9 hours/day, and
+daily life activity was 7 hours/day. The person's spatial probability
+distribution was determined using the bivariate distribution for the monitored
+location. In conclusion, the results showed that sleeping activity was
+dominant. Our study showed that TSAs are an optimal choice for monitoring
+human activity. Our proposed approach tackles limitations encountered by
+previous human activity monitoring systems by preserving a person's privacy
+while still determining their precise spatial location.
+
+
+
+
+
+ + ☆ PSTNet: Enhanced Polyp Segmentation with Multi-scale Alignment and + Frequency Domain Integration + + +
+ Accurate segmentation of colorectal polyps in colonoscopy images is crucial +for effective diagnosis and management of colorectal cancer (CRC). However, +current deep learning-based methods primarily rely on fusing RGB information +across multiple scales, leading to limitations in accurately identifying polyps +due to restricted RGB domain information and challenges in feature misalignment +during multi-scale aggregation. To address these limitations, we propose the +Polyp Segmentation Network with Shunted Transformer (PSTNet), a novel approach +that integrates both RGB and frequency domain cues present in the images. +PSTNet comprises three key modules: the Frequency Characterization Attention +Module (FCAM) for extracting frequency cues and capturing polyp +characteristics, the Feature Supplementary Alignment Module (FSAM) for aligning +semantic information and reducing misalignment noise, and the Cross Perception +localization Module (CPM) for synergizing frequency cues with high-level +semantics to achieve efficient polyp segmentation. Extensive experiments on +challenging datasets demonstrate PSTNet's significant improvement in polyp +segmentation accuracy across various metrics, consistently outperforming +state-of-the-art methods. The integration of frequency domain cues and the +novel architectural design of PSTNet contribute to advancing computer-assisted +polyp segmentation, facilitating more accurate diagnosis and management of CRC. + +
+
+
+
+
+ + ☆ Cross-conditioned Diffusion Model for Medical Image to Image Translation + + +
+ Multi-modal magnetic resonance imaging (MRI) provides rich, complementary +information for analyzing diseases. However, the practical challenges of +acquiring multiple MRI modalities, such as cost, scan time, and safety +considerations, often result in incomplete datasets. This affects both the +quality of diagnosis and the performance of deep learning models trained on +such data. Recent advancements in generative adversarial networks (GANs) and +denoising diffusion models have shown promise in natural and medical +image-to-image translation tasks. However, the complexity of training GANs and +the computational expense associated with diffusion models hinder their +development and application in this task. To address these issues, we introduce +a Cross-conditioned Diffusion Model (CDM) for medical image-to-image +translation. The core idea of CDM is to use the distribution of target +modalities as guidance to improve synthesis quality while achieving higher +generation efficiency compared to conventional diffusion models. First, we +propose a Modality-specific Representation Model (MRM) to model the +distribution of target modalities. Then, we design a Modality-decoupled +Diffusion Network (MDN) to efficiently and effectively learn the distribution +from MRM. Finally, a Cross-conditioned UNet (C-UNet) with a Condition Embedding +module is designed to synthesize the target modalities with the source +modalities as input and the target distribution for guidance. Extensive +experiments conducted on the BraTS2023 and UPenn-GBM benchmark datasets +demonstrate the superiority of our method. + +
+
+ comment: miccai24 +
+
+
+
+
+ + ☆ WheelPoser: Sparse-IMU Based Body Pose Estimation for Wheelchair Users + + +
+ Despite researchers having extensively studied various ways to track body +pose on-the-go, most prior work does not take into account wheelchair users, +leading to poor tracking performance. Wheelchair users could greatly benefit +from this pose information to prevent injuries, monitor their health, identify +environmental accessibility barriers, and interact with gaming and VR +experiences. In this work, we present WheelPoser, a real-time pose estimation +system specifically designed for wheelchair users. Our system uses only four +strategically placed IMUs on the user's body and wheelchair, making it far more +practical than prior systems using cameras and dense IMU arrays. WheelPoser is +able to track a wheelchair user's pose with a mean joint angle error of 14.30 +degrees and a mean joint position error of 6.74 cm, more than three times +better than similar systems using sparse IMUs. To train our system, we collect +a novel WheelPoser-IMU dataset, consisting of 167 minutes of paired IMU sensor +and motion capture data of people in wheelchairs, including wheelchair-specific +motions such as propulsion and pressure relief. Finally, we explore the +potential application space enabled by our system and discuss future +opportunities. Open-source code, models, and dataset can be found here: +https://github.com/axle-lab/WheelPoser. + +
+
+ comment: Accepted by ASSETS 2024 +
+
+
+
+
+ + ☆ Tri-Plane Mamba: Efficiently Adapting Segment Anything Model for 3D + Medical Images + + +
+ General networks for 3D medical image segmentation have recently undergone +extensive exploration. Behind the exceptional performance of these networks +lies a significant demand for a large volume of pixel-level annotated data, +which is time-consuming and labor-intensive. The emergence of the Segment +Anything Model (SAM) has enabled this model to achieve superior performance in +2D medical image segmentation tasks via parameter- and data-efficient feature +adaptation. However, the introduction of additional depth channels in 3D +medical images not only prevents the sharing of 2D pre-trained features but +also results in a quadratic increase in the computational cost for adapting +SAM. To overcome these challenges, we present the Tri-Plane Mamba (TP-Mamba) +adapters tailored for the SAM, featuring two major innovations: 1) multi-scale +3D convolutional adapters, optimized for efficiently processing local +depth-level information, 2) a tri-plane mamba module, engineered to capture +long-range depth-level representation without significantly increasing +computational costs. This approach achieves state-of-the-art performance in 3D +CT organ segmentation tasks. Remarkably, this superior performance is +maintained even with scarce training data. Specifically using only three CT +training samples from the BTCV dataset, it surpasses conventional 3D +segmentation networks, attaining a Dice score that is up to 12% higher. + +
+
+
+
+
+ + ☆ Risks When Sharing LoRA Fine-Tuned Diffusion Model Weights + + +
+ With the emerging trend in generative models and convenient public access to +diffusion models pre-trained on large datasets, users can fine-tune these +models to generate images of personal faces or items in new contexts described +by natural language. Parameter efficient fine-tuning (PEFT) such as Low Rank +Adaptation (LoRA) has become the most common way to save memory and computation +usage on the user end during fine-tuning. However, a natural question is +whether the private images used for fine-tuning will be leaked to adversaries +when sharing model weights. In this paper, we study the issue of privacy +leakage of a fine-tuned diffusion model in a practical setting, where +adversaries only have access to model weights, rather than prompts or images +used for fine-tuning. We design and build a variational network autoencoder +that takes model weights as input and outputs the reconstruction of private +images. To improve the efficiency of training such an autoencoder, we propose a +training paradigm with the help of timestep embedding. The results give a +surprising answer to this research question: an adversary can generate images +containing the same identities as the private images. Furthermore, we +demonstrate that no existing defense method, including differential +privacy-based methods, can preserve the privacy of private data used for +fine-tuning a diffusion model without compromising the utility of a fine-tuned +model. + +
+
+
+
+
+ + ☆ USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in + 2020s + + +
+ Image/video coding has been a remarkable research area for both academia and +industry for many years. Testing datasets, especially high-quality image/video +datasets are desirable for the justified evaluation of coding-related research, +practical applications, and standardization activities. We put forward a test +dataset namely USTC-TD, which has been successfully adopted in the practical +end-to-end image/video coding challenge of the IEEE International Conference on +Visual Communications and Image Processing in 2022 and 2023. USTC-TD contains +40 images at 4K spatial resolution and 10 video sequences at 1080p spatial +resolution, featuring various content due to the diverse environmental factors +(scene type, texture, motion, view) and the designed imaging factors +(illumination, shadow, lens). We quantitatively evaluate USTC-TD on different +image/video features (spatial, temporal, color, lightness), and compare it with +the previous image/video test datasets, which verifies the wider coverage and +more diversity of the proposed dataset. We also evaluate both classic +standardized and recent learned image/video coding schemes on USTC-TD with PSNR +and MS-SSIM, and provide an extensive benchmark for the evaluated schemes. +Based on the characteristics and specific design of the proposed test dataset, +we analyze the benchmark performance and shed light on the future research and +development of image/video coding. All the data are released online: +https://esakak.github.io/USTC-TD. + +
+
+ comment: 24 pages. Project Page: https://esakak.github.io/USTC-TD +
+
+
+
+
+ + ☆ RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense + Positive Supervision + + +
+ RT-DETR is the first real-time end-to-end transformer-based object detector.
+Its efficiency comes from its framework design and Hungarian matching.
+However, compared to dense supervision detectors like the YOLO series,
+Hungarian matching provides much sparser supervision, leading to insufficient
+model training and making it difficult to achieve optimal results. To address
+these issues, we propose a hierarchical dense positive supervision method
+based on RT-DETR, named RT-DETRv3. First, we introduce a CNN-based auxiliary
+branch that provides dense supervision and collaborates with the original
+decoder to enhance the encoder feature representation. Second, to address
+insufficient decoder training, we propose a novel learning strategy involving
+self-attention perturbation. This strategy diversifies label assignment for
+positive samples across multiple query groups, thereby enriching positive
+supervision. Additionally, we introduce a shared-weight decoder branch for
+dense positive supervision to ensure that more high-quality queries match each
+ground truth. Notably, all of the aforementioned modules are used only during
+training. We conduct extensive experiments to demonstrate the effectiveness of
+our approach on COCO val2017. RT-DETRv3 significantly outperforms existing
+real-time detectors, including the RT-DETR series and the YOLO series. For
+example, RT-DETRv3-R18 achieves 48.1% AP (+1.6%/+1.4%) compared to
+RT-DETR-R18/RT-DETRv2-R18 while maintaining the same latency. Meanwhile, it
+requires only half the number of training epochs to attain comparable
+performance. Furthermore, RT-DETRv3-R101 attains an impressive 54.6% AP,
+outperforming YOLOv10-X. Code will be released soon.
+
+
+
+
+
+ + ☆ Rethinking Meta-Learning from a Learning Lens + + +
+ Meta-learning has emerged as a powerful approach for leveraging knowledge
+from previous tasks to solve new tasks. The mainstream methods focus on
+training a well-generalized model initialization, which is then adapted to
+different tasks with limited data and updates. However, this pushes the model
+to overfit on the training tasks. Previous methods mainly attributed this to
+the lack of data and used augmentations to address the issue, but they were
+limited by the need for sufficient training and effective augmentation
+strategies. In this work, we focus on the more fundamental ``learning to
+learn'' strategy of meta-learning to explore what causes errors and how to
+eliminate these errors without changing the environment. Specifically, we
+first rethink the algorithmic procedure of meta-learning from a ``learning''
+lens. Through theoretical and empirical analyses, we find that (i) this
+paradigm faces the risk of both overfitting and underfitting, and (ii) the
+models adapted to different tasks promote each other, with the effect being
+stronger if the tasks are more similar. Based on this insight, we propose
+using task relations to calibrate the optimization process of meta-learning
+and introduce a plug-and-play method called the Task Relation Learner
+(TRLearner) to achieve this goal. Specifically, it first obtains task relation
+matrices from the extracted task-specific meta-data. Then, it uses the
+obtained matrices with relation-aware consistency regularization to guide
+optimization. Extensive theoretical and empirical analyses demonstrate the
+effectiveness of TRLearner.
+
+
+
+
+
+ + ☆ Generalization Boosted Adapter for Open-Vocabulary Segmentation + + +
+ Vision-language models (VLMs) have demonstrated remarkable open-vocabulary +object recognition capabilities, motivating their adaptation for dense +prediction tasks like segmentation. However, directly applying VLMs to such +tasks remains challenging due to their lack of pixel-level granularity and the +limited data available for fine-tuning, leading to overfitting and poor +generalization. To address these limitations, we propose Generalization Boosted +Adapter (GBA), a novel adapter strategy that enhances the generalization and +robustness of VLMs for open-vocabulary segmentation. GBA comprises two core +components: (1) a Style Diversification Adapter (SDA) that decouples features +into amplitude and phase components, operating solely on the amplitude to +enrich the feature space representation while preserving semantic consistency; +and (2) a Correlation Constraint Adapter (CCA) that employs cross-attention to +establish tighter semantic associations between text categories and target +regions, suppressing irrelevant low-frequency ``noise'' information and +avoiding erroneous associations. Through the synergistic effect of the shallow +SDA and the deep CCA, GBA effectively alleviates overfitting issues and +enhances the semantic relevance of feature representations. As a simple, +efficient, and plug-and-play component, GBA can be flexibly integrated into +various CLIP-based methods, demonstrating broad applicability and achieving +state-of-the-art performance on multiple open-vocabulary segmentation +benchmarks. + +
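+ The Style Diversification Adapter operates only on the amplitude of the
+feature spectrum while leaving the phase untouched. A minimal sketch of that
+decomposition, using a simple random amplitude jitter as a stand-in for the
+learned adapter (the actual SDA in GBA is a trained module):
+
+```python
+import torch
+
+def perturb_amplitude(feat: torch.Tensor, strength: float = 0.1) -> torch.Tensor:
+    """Split (B, C, H, W) features into amplitude/phase with a 2D FFT, jitter only
+    the amplitude, and reassemble; phase (semantic layout) is preserved."""
+    spec = torch.fft.fft2(feat, dim=(-2, -1))
+    amp, phase = spec.abs(), spec.angle()
+    amp = (amp * (1.0 + strength * torch.randn_like(amp))).clamp(min=0)
+    return torch.fft.ifft2(torch.polar(amp, phase), dim=(-2, -1)).real
+
+if __name__ == "__main__":
+    print(perturb_amplitude(torch.randn(2, 64, 32, 32)).shape)  # torch.Size([2, 64, 32, 32])
+```
+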
+
+
+
+
+ + ☆ VLTP: Vision-Language Guided Token Pruning for Task-Oriented + Segmentation + + +
+ Vision Transformers (ViTs) have emerged as the backbone of many segmentation
+models, consistently achieving state-of-the-art (SOTA) performance. However,
+their success comes at a significant computational cost. Image token pruning
+is one of the most effective strategies to address this complexity. However,
+previous approaches fall short when applied to more complex task-oriented
+segmentation (TOS), where the class of each image patch is not predefined but
+depends on the specific input task. This work introduces Vision Language
+Guided Token Pruning (VLTP), a novel token pruning mechanism that can
+accelerate ViT-based segmentation models, particularly for TOS guided by a
+multi-modal large language model (MLLM). We argue that a ViT does not need to
+process every image token through all of its layers; only the tokens related
+to the reasoning task are necessary. We design a new pruning decoder that
+takes both image tokens and vision-language guidance as input to predict the
+relevance of each image token to the task. Only image tokens with high
+relevance are passed to the deeper layers of the ViT. Experiments show that
+the VLTP framework reduces the computational cost of the ViT by approximately
+25% without performance degradation, and by around 40% with only a 1%
+performance drop.
+
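+ A minimal sketch of relevance-based token pruning in this spirit: a
+(hypothetical) scorer compares each image token with a vision-language
+guidance vector, and only the top-scoring tokens are forwarded to the deeper
+ViT layers. Dot-product scoring and a fixed keep ratio are illustrative
+assumptions, not the learned pruning decoder used by VLTP.
+
+```python
+import torch
+
+def prune_tokens(tokens: torch.Tensor, guidance: torch.Tensor, keep_ratio: float = 0.6):
+    """tokens: (B, N, D) image tokens; guidance: (B, D) vision-language vector.
+    Returns the kept tokens (B, K, D) and their indices (B, K)."""
+    scores = torch.einsum("bnd,bd->bn", tokens, guidance)   # relevance of each token
+    k = max(1, int(tokens.size(1) * keep_ratio))
+    topk = scores.topk(k, dim=1).indices
+    kept = torch.gather(tokens, 1, topk.unsqueeze(-1).expand(-1, -1, tokens.size(-1)))
+    return kept, topk
+
+if __name__ == "__main__":
+    kept, idx = prune_tokens(torch.randn(2, 196, 768), torch.randn(2, 768))
+    print(kept.shape, idx.shape)  # torch.Size([2, 117, 768]) torch.Size([2, 117])
+```
+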
+
+
+
+
+ + ☆ VistaFormer: Scalable Vision Transformers for Satellite Image Time + Series Segmentation + + +
+ We introduce VistaFormer, a lightweight Transformer-based model architecture +for the semantic segmentation of remote-sensing images. This model uses a +multi-scale Transformer-based encoder with a lightweight decoder that +aggregates global and local attention captured in the encoder blocks. +VistaFormer uses position-free self-attention layers which simplifies the model +architecture and removes the need to interpolate temporal and spatial codes, +which can reduce model performance when training and testing image resolutions +differ. We investigate simple techniques for filtering noisy input signals like +clouds and demonstrate that improved model scalability can be achieved by +substituting Multi-Head Self-Attention (MHSA) with Neighbourhood Attention +(NA). Experiments on the PASTIS and MTLCC crop-type segmentation benchmarks +show that VistaFormer achieves better performance than comparable models and +requires only 8% of the floating point operations using MHSA and 11% using NA +while also using fewer trainable parameters. VistaFormer with MHSA improves on +state-of-the-art mIoU scores by 0.1% on the PASTIS benchmark and 3% on the +MTLCC benchmark while VistaFormer with NA improves on the MTLCC benchmark by +3.7%. + +
+
+
+
+
+ + ☆ Towards Unified Facial Action Unit Recognition Framework by Large + Language Models + + +
+ Facial Action Units (AUs) are of great significance in the realm of affective +computing. In this paper, we propose AU-LLaVA, the first unified AU recognition +framework based on the Large Language Model (LLM). AU-LLaVA consists of a +visual encoder, a linear projector layer, and a pre-trained LLM. We +meticulously craft the text descriptions and fine-tune the model on various AU +datasets, allowing it to generate different formats of AU recognition results +for the same input image. On the BP4D and DISFA datasets, AU-LLaVA delivers the +most accurate recognition results for nearly half of the AUs. Our model +achieves improvements of F1-score up to 11.4% in specific AU recognition +compared to previous benchmark results. On the FEAFA dataset, our method +achieves significant improvements over all 24 AUs compared to previous +benchmark results. AU-LLaVA demonstrates exceptional performance and +versatility in AU recognition. + +
+
+
+
+
+ + ☆ CF-PRNet: Coarse-to-Fine Prototype Refining Network for Point Cloud + Completion and Reconstruction ECCV2024 + + +
+ In modern agriculture, precise monitoring of plants and fruits is crucial for
+tasks such as high-throughput phenotyping and automated harvesting. This paper
+addresses the challenge of reconstructing accurate 3D shapes of fruits from
+partial views, which are common in agricultural settings. We introduce
+CF-PRNet, a coarse-to-fine prototype refining network that leverages
+high-resolution 3D data during the training phase but requires only a single
+RGB-D image for real-time inference. Our approach begins by extracting
+features from the incomplete point cloud constructed from a partial view of a
+fruit using a series of convolutional blocks. The extracted features inform
+the generation of scaling vectors that refine two sequentially constructed 3D
+mesh prototypes - one coarse and one fine-grained. This progressive refinement
+facilitates the detailed completion of the final point clouds, achieving
+detailed and accurate reconstructions. CF-PRNet demonstrates excellent
+performance, with a Chamfer Distance of 3.78, an F1 Score of 66.76%, a
+Precision of 56.56%, and a Recall of 85.31%, and won first place in the Shape
+Completion and Reconstruction of Sweet Peppers Challenge.
+
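+ For reference, the Chamfer Distance reported above is the average
+nearest-neighbour distance between the predicted and ground-truth point
+clouds, taken in both directions. A brute-force NumPy sketch is shown below;
+note that conventions (squared vs. unsquared distances, scaling) vary between
+papers, so this is illustrative rather than the challenge's exact metric.
+
+```python
+import numpy as np
+
+def chamfer_distance(pred: np.ndarray, gt: np.ndarray) -> float:
+    """Symmetric Chamfer distance between point sets of shape (N, 3) and (M, 3)."""
+    d2 = ((pred[:, None, :] - gt[None, :, :]) ** 2).sum(-1)   # (N, M) squared distances
+    return float(d2.min(axis=1).mean() + d2.min(axis=0).mean())
+
+if __name__ == "__main__":
+    cloud = np.random.rand(1024, 3)
+    print(chamfer_distance(cloud, cloud))  # 0.0 for identical clouds
+```
+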
+
+ comment: Technical Report of the 1st place solution to CVPPA@ECCV2024: Shape + Completion and Reconstruction of Sweet Peppers Challenge +
+
+
+
+
+ + ♻ ☆ NPGA: Neural Parametric Gaussian Avatars + + +
+ The creation of high-fidelity, digital versions of human heads is an
+important stepping stone in the process of further integrating virtual
+components into our everyday lives. Constructing such avatars is a challenging
+research problem, due to a high demand for photo-realism and real-time
+rendering performance. In this work, we propose Neural Parametric Gaussian
+Avatars (NPGA), a data-driven approach to create high-fidelity, controllable
+avatars from multi-view video recordings. We build our method around 3D
+Gaussian splatting for its highly efficient rendering and to inherit the
+topological flexibility of point clouds. In contrast to previous work, we
+condition our avatars' dynamics on the rich expression space of neural
+parametric head models (NPHM), instead of mesh-based 3DMMs. To this end, we
+distill the backward deformation field of our underlying NPHM into forward
+deformations which are compatible with rasterization-based rendering. All
+remaining fine-scale, expression-dependent details are learned from the
+multi-view videos. For increased representational capacity of our avatars, we
+propose per-Gaussian latent features that condition each primitive's dynamic
+behavior. To regularize this increased dynamic expressivity, we propose
+Laplacian terms on the latent features and predicted dynamics. We evaluate our
+method on the public NeRSemble dataset, demonstrating that NPGA significantly
+outperforms the previous state-of-the-art avatars on the self-reenactment task
+by 2.6 PSNR. Furthermore, we demonstrate accurate animation capabilities from
+real-world monocular videos.
+
+
+ comment: Project Page: see https://simongiebenhain.github.io/NPGA/ ; Youtube + Video: see https://youtu.be/t0S0OK7WnA4 +
+
+
+
+
+ + ♻ ☆ Pseudo-Prompt Generating in Pre-trained Vision-Language Models for + Multi-Label Medical Image Classification + + +
+ The task of medical image recognition is notably complicated by the presence
+of varied and multiple pathological indications, presenting a unique challenge
+in multi-label classification with unseen labels. This complexity underlines
+the need for computer-aided diagnosis methods employing multi-label zero-shot
+learning. Recent advancements in pre-trained vision-language models (VLMs)
+have showcased notable zero-shot classification abilities on medical images.
+However, these methods have limitations in leveraging extensive pre-trained
+knowledge from broader image datasets, and often depend on manual prompt
+construction by expert radiologists. By automating the process of prompt
+tuning, prompt learning techniques have emerged as an efficient way to adapt
+VLMs to downstream tasks. Yet, existing CoOp-based strategies fall short in
+performing class-specific prompting on unseen categories, limiting
+generalizability in fine-grained scenarios. To overcome these constraints, we
+introduce a novel prompt generation approach inspired by text generation in
+natural language processing (NLP). Our method, named Pseudo-Prompt Generating
+(PsPG), capitalizes on the prior knowledge of multi-modal features. Featuring
+an RNN-based decoder, PsPG autoregressively generates class-tailored embedding
+vectors, i.e., pseudo-prompts. Comparative evaluations on various multi-label
+chest radiograph datasets affirm the superiority of our approach against
+leading medical vision-language and multi-label prompt learning methods. The
+source code is available at https://github.com/fallingnight/PsPG
+
+
+ comment: Accepted by PRCV 2024 +
+
+
+
+
+ + ♻ ☆ Does a Neural Network Really Encode Symbolic Concepts? + + +
+ Recently, a series of studies have tried to extract interactions between +input variables modeled by a DNN and define such interactions as concepts +encoded by the DNN. However, strictly speaking, there still lacks a solid +guarantee whether such interactions indeed represent meaningful concepts. +Therefore, in this paper, we examine the trustworthiness of interaction +concepts from four perspectives. Extensive empirical studies have verified that +a well-trained DNN usually encodes sparse, transferable, and discriminative +concepts, which is partially aligned with human intuition. + +
+
+
+
+
+ + ♻ ☆ Pavlok-Nudge: A Feedback Mechanism for Atomic Behaviour Modification + with Snoring Usecase + + +
+ This paper proposes a feedback mechanism to change behavioural patterns using
+the Pavlok device. Pavlok utilises beeps, vibration and shocks as a mode of
+aversion technique to help individuals with behaviour modification. While the
+device can be useful in certain periodic daily life situations, like alarms
+and exercise notifications, it relies on manual operations that limit its
+usage. To automate behaviour modification, we propose a framework that first
+detects targeted behaviours through a lightweight deep learning model and
+subsequently nudges the user through Pavlok. Our proposed solution is
+implemented and verified in the context of snoring: it captures audio from the
+environment and predicts whether the audio content is a snore or not using a
+1D convolutional neural network. Based on the prediction, we use Pavlok to
+nudge users towards preventive measures, such as a change in sleeping posture.
+We believe that this simple solution can help people change their atomic
+habits, which may lead to long-term health benefits. Our proposed real-time,
+lightweight model (99.8% fewer parameters than SOTA; 1,278,049 --> 1337)
+achieves SOTA performance (test accuracy of 0.99) on a public domain
+benchmark. The code and model are publicly available at
+https://github.com/hasan-rakibul/pavlok-nudge-snore.
+
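+ A minimal sketch of a lightweight 1D CNN snore/non-snore classifier of the
+kind described above, operating on a one-second raw-audio window. The layer
+sizes and input length are illustrative assumptions and not the
+~1.3k-parameter architecture reported by the authors.
+
+```python
+import torch
+import torch.nn as nn
+
+class SnoreNet(nn.Module):
+    """Tiny 1D CNN for binary snore detection on a 1-second mono waveform."""
+
+    def __init__(self):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv1d(1, 8, kernel_size=64, stride=8), nn.ReLU(),
+            nn.MaxPool1d(4),
+            nn.Conv1d(8, 16, kernel_size=16, stride=2), nn.ReLU(),
+            nn.AdaptiveAvgPool1d(1),
+        )
+        self.classifier = nn.Linear(16, 2)  # snore vs. non-snore logits
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (B, 1, num_samples) raw waveform, e.g. 16 kHz mono audio
+        return self.classifier(self.features(x).squeeze(-1))
+
+if __name__ == "__main__":
+    print(SnoreNet()(torch.randn(4, 1, 16000)).shape)  # torch.Size([4, 2])
+```
+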
+
+ comment: Md Rakibul Hasan and Shreya Ghosh are co-first authors +
+
+
+
+
+ + ♻ ☆ IBoxCLA: Towards Robust Box-supervised Segmentation of Polyp via + Improved Box-dice and Contrastive Latent-anchors + + +
+ Box-supervised polyp segmentation attracts increasing attention for its +cost-effective potential. Existing solutions often rely on learning-free +methods or pretrained models to laboriously generate pseudo masks, triggering +Dice constraint subsequently. In this paper, we found that a model guided by +the simplest box-filled masks can accurately predict polyp locations/sizes, but +suffers from shape collapsing. In response, we propose two innovative learning +fashions, Improved Box-dice (IBox) and Contrastive Latent-Anchors (CLA), and +combine them to train a robust box-supervised model IBoxCLA. The core idea +behind IBoxCLA is to decouple the learning of location/size and shape, allowing +for focused constraints on each of them. Specifically, IBox transforms the +segmentation map into a proxy map using shape decoupling and confusion-region +swapping sequentially. Within the proxy map, shapes are disentangled, while +locations/sizes are encoded as box-like responses. By constraining the proxy +map instead of the raw prediction, the box-filled mask can well supervise +IBoxCLA without misleading its shape learning. Furthermore, CLA contributes to +shape learning by generating two types of latent anchors, which are learned and +updated using momentum and segmented polyps to steadily represent polyp and +background features. The latent anchors facilitate IBoxCLA to capture +discriminative features within and outside boxes in a contrastive manner, +yielding clearer boundaries. We benchmark IBoxCLA on five public polyp +datasets. The experimental results demonstrate the competitive performance of +IBoxCLA compared to recent fully-supervised polyp segmentation methods, and its +superiority over other box-supervised state-of-the-arts with a relative +increase of overall mDice and mIoU by at least 6.5% and 7.5%, respectively. + +
+
+
+
+
+ + ♻ ☆ ODYSSEE: Oyster Detection Yielded by Sensor Systems on Edge Electronics + + +
+ Oysters are a vital keystone species in coastal ecosystems, providing +significant economic, environmental, and cultural benefits. As the importance +of oysters grows, so does the relevance of autonomous systems for their +detection and monitoring. However, current monitoring strategies often rely on +destructive methods. While manual identification of oysters from video footage +is non-destructive, it is time-consuming, requires expert input, and is further +complicated by the challenges of the underwater environment. + To address these challenges, we propose a novel pipeline using stable +diffusion to augment a collected real dataset with realistic synthetic data. +This method enhances the dataset used to train a YOLOv10-based vision model. +The model is then deployed and tested on an edge platform in underwater +robotics, achieving a state-of-the-art 0.657 mAP@50 for oyster detection on the +Aqua2 platform. + +
+
+
+
+
+ + ♻ ☆ MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth + Estimation of Endoscopic Images + + +
+ Photometric constraint is indispensable for self-supervised monocular depth
+estimation. It involves warping a source image onto a target view using the
+estimated depth and pose, and then minimizing the difference between the warped
+and target images. However, the endoscope's built-in light causes significant
+brightness fluctuations, which make the photometric constraint unreliable.
+Previous efforts only mitigate this by relying on extra models to calibrate
+image brightness. In this paper, we propose MonoPCC to address the brightness
+inconsistency radically by reshaping the photometric constraint into a cycle
+form. Instead of only warping the source image, MonoPCC constructs a closed
+loop consisting of two opposite forward-backward warping paths: from target to
+source and then back to target. Thus, the target image finally receives an
+image cycle-warped from itself, which naturally makes the constraint invariant
+to brightness changes. Moreover, MonoPCC transplants the source image's
+phase-frequency into the intermediate warped image to avoid structure loss, and
+also stabilizes the training via an exponential moving average (EMA) strategy
+to avoid frequent changes in the forward warping. Comprehensive and extensive
+experimental results on four endoscopic datasets demonstrate that our proposed
+MonoPCC shows great robustness to brightness inconsistency and exceeds other
+state-of-the-art methods, reducing the absolute relative error by at least
+7.27%, 9.38%, 9.90% and 3.17% on the respective datasets.
+
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ RTF-Q: Efficient Unsupervised Domain Adaptation with Retraining-free + Quantization + + +
+ Performing unsupervised domain adaptation on resource-constrained edge +devices is challenging. Existing research typically adopts architecture +optimization (e.g., designing slimmable networks) but requires expensive +training costs. Moreover, it does not consider the considerable precision +redundancy of parameters and activations. To address these limitations, we +propose efficient unsupervised domain adaptation with ReTraining-Free +Quantization (RTF-Q). Our approach uses low-precision quantization +architectures with varying computational costs, adapting to devices with +dynamic computation budgets. We subtly configure subnet dimensions and leverage +weight-sharing to optimize multiple architectures within a single set of +weights, enabling the use of pre-trained models from open-source repositories. +Additionally, we introduce multi-bitwidth joint training and the SandwichQ +rule, both of which are effective in handling multiple quantization bit-widths +across subnets. Experimental results demonstrate that our network achieves +competitive accuracy with state-of-the-art methods across three benchmarks +while significantly reducing memory and computational costs. + +
+
+
+
+
+ + ♻ ☆ Defining and Extracting generalizable interaction primitives from DNNs + + +
+ Faithfully summarizing the knowledge encoded by a deep neural network (DNN) +into a few symbolic primitive patterns without losing much information +represents a core challenge in explainable AI. To this end, Ren et al. (2024) +have derived a series of theorems to prove that the inference score of a DNN +can be explained as a small set of interactions between input variables. +However, the lack of generalization power makes it still hard to consider such +interactions as faithful primitive patterns encoded by the DNN. Therefore, +given different DNNs trained for the same task, we develop a new method to +extract interactions that are shared by these DNNs. Experiments show that the +extracted interactions can better reflect common knowledge shared by different +DNNs. + +
+
+
+
+
+ + ♻ ☆ XAMI -- A Benchmark Dataset for Artefact Detection in XMM-Newton Optical + Images SP + + +
+ Reflected or scattered light produces artefacts in astronomical observations
+that can negatively impact scientific studies. Hence, automated detection of
+these artefacts is highly beneficial, especially with the increasing amounts of
+data gathered. Machine learning methods are well-suited to this problem, but
+currently there is a lack of annotated data to train such approaches to detect
+artefacts in astronomical observations. In this work, we present a dataset of
+images from the XMM-Newton space telescope Optical Monitoring camera showing
+different types of artefacts. We hand-annotated a sample of 1000 images with
+artefacts, which we use to train automated ML methods. We further demonstrate
+techniques tailored for accurate detection and masking of artefacts using
+instance segmentation. We adopt a hybrid approach, combining convolutional
+neural networks (CNNs) and transformer-based models to exploit their respective
+advantages in segmentation. The presented method and dataset will advance
+artefact detection in astronomical observations by providing a reproducible
+baseline. All code and data are made available
+(https://github.com/ESA-Datalabs/XAMI-model and
+https://github.com/ESA-Datalabs/XAMI-dataset).
+
+
+ comment: Accepted for oral presentation at SPAICE 2024 +
+
+
+
+
+ + ♻ ☆ Exclusive Style Removal for Cross Domain Novel Class Discovery + + +
+ As a promising field in open-world learning, Novel Class Discovery (NCD) is
+usually the task of clustering unseen novel classes in an unlabeled set based
+on the prior knowledge of labeled data within the same domain. However, the
+performance of existing NCD methods can be severely compromised when novel
+classes are sampled from a different distribution than the labeled ones. In
+this paper, we explore and establish the solvability of NCD in the cross-domain
+setting, under the necessary condition that style information is removed. Based
+on this theoretical analysis, we introduce an exclusive style removal module
+that extracts style information distinct from the baseline features, thereby
+facilitating inference. Moreover, this module is easy to integrate with other
+NCD methods, acting as a plug-in to improve performance on novel classes whose
+distributions differ from the seen labeled set. Additionally, recognizing the
+non-negligible influence of different backbones and pre-training strategies on
+the performance of NCD methods, we build a fair benchmark for future NCD
+research. Extensive experiments on three common datasets demonstrate the
+effectiveness of our proposed module.
+
+
+
+
+
+ + ♻ ☆ LTRL: Boosting Long-tail Recognition via Reflective Learning ECCV2024 + + +
+ In real-world scenarios, knowledge distributions often exhibit a long tail.
+Humans nevertheless manage to master knowledge uniformly across imbalanced
+distributions, a feat attributed to their diligent practices of reviewing,
+summarizing, and correcting errors. Motivated by this learning process, we
+propose a novel learning paradigm, called reflective learning, for handling
+long-tail recognition. Our method integrates three processes: reviewing past
+predictions during training, summarizing and leveraging feature relations
+across classes, and correcting gradient conflicts between loss functions. These
+designs are lightweight enough to plug and play with existing long-tail
+learning methods, achieving state-of-the-art performance on popular long-tail
+visual benchmarks. The experimental results highlight the great potential of
+reflective learning in dealing with long-tail recognition.
+
+
+ comment: ECCV2024, Oral +
+
+
+
+
+ + ♻ ☆ Diffusion-driven lensless fiber endomicroscopic quantitative phase + imaging towards digital pathology + + +
+ The lensless fiber endomicroscope is an emerging tool for in-vivo microscopic
+imaging, where quantitative phase imaging (QPI) can be utilized as a label-free
+method to enhance image contrast. However, existing single-shot phase
+reconstruction methods for lensless fiber endomicroscopy typically perform well
+on simple images but struggle with complex microscopic structures. Here, we
+propose a speckle-conditioned diffusion model (SpecDiffusion), which
+reconstructs phase images directly from speckles captured at the detection side
+of a multi-core fiber (MCF). Unlike conventional neural networks, SpecDiffusion
+employs iterative phase denoising steps for speckle-driven phase
+reconstruction. The iteration scheme allows SpecDiffusion to break down the
+phase reconstruction process into multiple steps, gradually building up to the
+final phase image. This attribute alleviates the computation challenge at each
+step and enables the reconstruction of rich details in complex microscopic
+images. To validate its efficacy, we build an optical system to capture
+speckles from the MCF and construct a dataset consisting of 100,000 paired
+images. SpecDiffusion provides high-fidelity phase reconstruction results and
+shows powerful generalization capacity for unseen objects, such as test charts
+and biological tissues, reducing the average mean absolute error of the
+reconstructed tissue images by a factor of 7. Furthermore, the tissue images
+reconstructed by SpecDiffusion show higher accuracy in zero-shot cell
+segmentation tasks compared to the conventional method, demonstrating the
+potential for further cell morphology analysis with the learning-based lensless
+fiber endomicroscope. SpecDiffusion offers a precise and generalizable method
+for phase reconstruction through scattering media, including MCFs, opening new
+perspectives in lensless fiber endomicroscopic imaging.
+
+
+
+
+
+ + ♻ ☆ Exploring the Effectiveness of Object-Centric Representations in Visual + Question Answering: Comparative Insights with Foundation Models + + +
+ Object-centric (OC) representations, which represent the state of a visual +scene by modeling it as a composition of objects, have the potential to be used +in various downstream tasks to achieve systematic compositional generalization +and facilitate reasoning. However, these claims have not been thoroughly +analyzed yet. Recently, foundation models have demonstrated unparalleled +capabilities across diverse domains from language to computer vision, marking +them as a potential cornerstone of future research for a multitude of +computational tasks. In this paper, we conduct an extensive empirical study on +representation learning for downstream Visual Question Answering (VQA), which +requires an accurate compositional understanding of the scene. We thoroughly +investigate the benefits and trade-offs of OC models and alternative approaches +including large pre-trained foundation models on both synthetic and real-world +data, and demonstrate a viable way to achieve the best of both worlds. The +extensiveness of our study, encompassing over 800 downstream VQA models and 15 +different types of upstream representations, also provides several additional +insights that we believe will be of interest to the community at large. + +
+
+
+
+
+ + ♻ ☆ Efficient Image Super-Resolution with Feature Interaction Weighted + Hybrid Network AAAI2022 + + +
+ Lightweight image super-resolution aims to reconstruct high-resolution images +from low-resolution images using low computational costs. However, existing +methods result in the loss of middle-layer features due to activation +functions. To minimize the impact of intermediate feature loss on +reconstruction quality, we propose a Feature Interaction Weighted Hybrid +Network (FIWHN), which comprises a series of Wide-residual Distillation +Interaction Block (WDIB) as the backbone. Every third WDIB forms a Feature +Shuffle Weighted Group (FSWG) by applying mutual information shuffle and +fusion. Moreover, to mitigate the negative effects of intermediate feature +loss, we introduce Wide Residual Weighting units within WDIB. These units +effectively fuse features of varying levels of detail through a Wide-residual +Distillation Connection (WRDC) and a Self-Calibrating Fusion (SCF). To +compensate for global feature deficiencies, we incorporate a Transformer and +explore a novel architecture to combine CNN and Transformer. We show that our +FIWHN achieves a favorable balance between performance and efficiency through +extensive experiments on low-level and high-level tasks. Codes will be +available at \url{https://github.com/IVIPLab/FIWHN}. + +
+
+ comment: 12 pages, 12 figures, IEEE Transactions on Multimedia (extension of
+ our AAAI 2022 paper)
+
+
+
+
+
+ + ♻ ☆ FaçAID: A Transformer Model for Neuro-Symbolic Facade Reconstruction SIGGRAPH + + +
+ We introduce a neuro-symbolic transformer-based model that converts flat,
+segmented facade structures into procedural definitions using a custom-designed
+split grammar. To facilitate this, we first develop a semi-complex split
+grammar tailored for architectural facades and then generate a dataset
+comprising facades alongside their corresponding procedural representations.
+This dataset is used to train our transformer model to convert segmented, flat
+facades into the procedural language of our grammar. During inference, the
+model applies this learned transformation to new facade segmentations,
+providing a procedural representation that users can adjust to generate varied
+facade designs. This method not only automates the conversion of static facade
+images into dynamic, editable procedural formats but also enhances design
+flexibility, allowing for easy modifications.
+
+
+ comment: 11 pages, 11 figures, in ACM SIGGRAPH Asia 2024 Conference Papers + Proceedings +
+
+
+
+
+ + ♻ ☆ A Closer Look at GAN Priors: Exploiting Intermediate Features for + Enhanced Model Inversion Attacks ECCV 2024 + + +
+ Model Inversion (MI) attacks aim to reconstruct privacy-sensitive training
+data from released models by utilizing output information, raising extensive
+concerns about the security of Deep Neural Networks (DNNs). Recent advances in
+generative adversarial networks (GANs) have contributed significantly to the
+improved performance of MI attacks due to their powerful ability to generate
+realistic images with high fidelity and appropriate semantics. However,
+previous MI attacks have solely disclosed private information in the latent
+space of GAN priors, limiting their semantic extraction and transferability
+across multiple target models and datasets. To address this challenge, we
+propose a novel method, Intermediate Features enhanced Generative Model
+Inversion (IF-GMI), which disassembles the GAN structure and exploits features
+between intermediate blocks. This allows us to extend the optimization space
+from the latent code to intermediate features with enhanced expressive
+capabilities. To prevent GAN priors from generating unrealistic images, we
+apply an L1 ball constraint to the optimization process. Experiments on
+multiple benchmarks demonstrate that our method significantly outperforms
+previous approaches and achieves state-of-the-art results under various
+settings, especially in the out-of-distribution (OOD) scenario. Our code is
+available at: https://github.com/final-solution/IF-GMI
+
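+ The L1 ball constraint mentioned above keeps the optimized intermediate
+features close to the GAN prior. A standard way to enforce such a constraint is
+Euclidean projection onto the L1 ball (the sorting-based algorithm of Duchi et
+al.); the generic NumPy sketch below illustrates the projection step and is not
+taken from the IF-GMI code.
+
+```python
+import numpy as np
+
+def project_l1_ball(v, radius=1.0):
+    """Project vector v onto the L1 ball {x : ||x||_1 <= radius}."""
+    u = np.abs(v)
+    if u.sum() <= radius:
+        return v.copy()                        # already inside the ball
+    w = -np.sort(-u)                           # magnitudes, descending
+    css = np.cumsum(w)
+    idx = np.arange(1, w.size + 1)
+    rho = np.nonzero(w * idx > (css - radius))[0][-1]
+    theta = (css[rho] - radius) / (rho + 1.0)  # soft-threshold level
+    return np.sign(v) * np.maximum(u - theta, 0.0)
+
+x = np.random.randn(16)
+print(np.abs(project_l1_ball(x, radius=2.0)).sum())  # -> 2.0 (up to rounding)
+```
+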
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Where We Have Arrived in Proving the Emergence of Sparse Symbolic + Concepts in AI Models + + +
+ This study aims to prove the emergence of symbolic concepts (or more +precisely, sparse primitive inference patterns) in well-trained deep neural +networks (DNNs). Specifically, we prove the following three conditions for the +emergence. (i) The high-order derivatives of the network output with respect to +the input variables are all zero. (ii) The DNN can be used on occluded samples +and when the input sample is less occluded, the DNN will yield higher +confidence. (iii) The confidence of the DNN does not significantly degrade on +occluded samples. These conditions are quite common, and we prove that under +these conditions, the DNN will only encode a relatively small number of sparse +interactions between input variables. Moreover, we can consider such +interactions as symbolic primitive inference patterns encoded by a DNN, because +we show that inference scores of the DNN on an exponentially large number of +randomly masked samples can always be well mimicked by numerical effects of +just a few interactions. + +
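+ The "interactions" in this line of work are commonly formalized as Harsanyi
+dividends over masked inputs: I(S) = sum over T subset of S of
+(-1)^(|S|-|T|) v(T), where v(T) is the model output when only the variables in
+T are left unmasked. The brute-force sketch below illustrates that general
+definition for small inputs; it is not the authors' implementation.
+
+```python
+from itertools import combinations
+
+def harsanyi_interactions(v, n):
+    """Brute-force Harsanyi dividends I(S) for all S within {0, ..., n-1}.
+
+    v(frozenset) -> float returns the model score when exactly the variables
+    in the given subset are unmasked (exponential cost, so demo-sized only).
+    """
+    subsets = [frozenset(c) for k in range(n + 1)
+               for c in combinations(range(n), k)]
+    score = {S: v(S) for S in subsets}
+    return {S: sum((-1) ** (len(S) - k) * score[frozenset(T)]
+                   for k in range(len(S) + 1) for T in combinations(S, k))
+            for S in subsets}
+
+# Toy check: for v(S) = |S|, only singleton sets carry non-zero interaction.
+I = harsanyi_interactions(lambda S: float(len(S)), n=3)
+assert abs(I[frozenset({0})] - 1.0) < 1e-9 and abs(I[frozenset({0, 1})]) < 1e-9
+```
+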
+
+
+
+
+ + ♻ ☆ Explaining Generalization Power of a DNN Using Interactive Concepts + + +
+ This paper explains the generalization power of a deep neural network (DNN)
+from the perspective of interactions. Although there is no universally accepted
+definition of the concepts encoded by a DNN, the sparsity of interactions in a
+DNN has been proved, i.e., the output score of a DNN can be well explained by a
+small number of interactions between input variables. In this way, to some
+extent, we can consider such interactions as interactive concepts encoded by
+the DNN. Therefore, in this paper, we derive an analytic explanation of the
+inconsistency of concepts of different complexities. This may shed new light on
+using the generalization power of concepts to explain the generalization power
+of the entire DNN. Besides, we discover that a DNN with stronger generalization
+power usually learns simple concepts more quickly and encodes fewer complex
+concepts. We also discover the detouring dynamics of learning complex concepts,
+which explains both the high learning difficulty and the low generalization
+power of complex concepts. The code will be released when the paper is
+accepted.
+
+
+
+
+
+ + ♻ ☆ Estimating Atmospheric Variables from Digital Typhoon Satellite Images + via Conditional Denoising Diffusion Models + + +
+ This study explores the application of diffusion models to typhoons,
+predicting multiple ERA5 meteorological variables simultaneously from Digital
+Typhoon satellite images. The study focuses on Taiwan, an area highly
+vulnerable to typhoons. Comparing the performance of the Conditional Denoising
+Diffusion Probability Model (CDDPM) with Convolutional Neural Networks (CNN)
+and Squeeze-and-Excitation Networks (SENet), the results suggest that the CDDPM
+performs best in generating accurate and realistic meteorological data.
+Specifically, CDDPM achieved a PSNR of 32.807, which is approximately 7.9%
+higher than CNN and 5.5% higher than SENet. Furthermore, CDDPM recorded an RMSE
+of 0.032, an 11.1% improvement over CNN and an 8.6% improvement over SENet. A
+key application of this research is the imputation of missing meteorological
+datasets and the generation of additional high-quality meteorological data from
+satellite images. It is hoped that the results of this analysis will enable
+more robust and detailed forecasting, reducing the impact of severe weather
+events on vulnerable regions. Code is accessible at
+https://github.com/TammyLing/Typhoon-forecasting.
+
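+ For reference, the PSNR and RMSE figures quoted above are standard
+reconstruction metrics; for data normalised to [0, 1] they reduce to the short
+NumPy computation below (a generic sketch, not the paper's evaluation code).
+
+```python
+import numpy as np
+
+def rmse(pred, target):
+    return float(np.sqrt(np.mean((pred - target) ** 2)))
+
+def psnr(pred, target, data_range=1.0):
+    mse = np.mean((pred - target) ** 2)
+    return float(10.0 * np.log10(data_range ** 2 / mse))
+
+pred, target = np.random.rand(64, 64), np.random.rand(64, 64)
+print(rmse(pred, target), psnr(pred, target))
+```
+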
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Efficient Diffusion Model for Image Restoration by Residual Shifting + + +
+ While diffusion-based image restoration (IR) methods have achieved remarkable +success, they are still limited by the low inference speed attributed to the +necessity of executing hundreds or even thousands of sampling steps. Existing +acceleration sampling techniques, though seeking to expedite the process, +inevitably sacrifice performance to some extent, resulting in over-blurry +restored outcomes. To address this issue, this study proposes a novel and +efficient diffusion model for IR that significantly reduces the required number +of diffusion steps. Our method avoids the need for post-acceleration during +inference, thereby avoiding the associated performance deterioration. +Specifically, our proposed method establishes a Markov chain that facilitates +the transitions between the high-quality and low-quality images by shifting +their residuals, substantially improving the transition efficiency. A carefully +formulated noise schedule is devised to flexibly control the shifting speed and +the noise strength during the diffusion process. Extensive experimental +evaluations demonstrate that the proposed method achieves superior or +comparable performance to current state-of-the-art methods on three classical +IR tasks, namely image super-resolution, image inpainting, and blind face +restoration, \textit{\textbf{even only with four sampling steps}}. Our code and +model are publicly available at \url{https://github.com/zsyOAOA/ResShift}. + +
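+ One way to read the residual-shifting construction described above: the
+forward process moves the high-quality image x0 toward its degraded counterpart
+y0 under a monotone schedule eta_t, roughly x_t = x0 + eta_t * (y0 - x0) plus
+scaled Gaussian noise. The toy NumPy sketch below follows that schematic
+reading only; the actual noise schedule and constants are those defined in the
+paper and code linked above.
+
+```python
+import numpy as np
+
+def residual_shift_forward(x0, y0, eta_t, kappa=1.0, rng=np.random.default_rng(0)):
+    """Toy forward step shifting the HQ image x0 toward the LQ image y0.
+
+    Schematic: x_t = x0 + eta_t * (y0 - x0) + kappa * sqrt(eta_t) * noise.
+    """
+    noise = rng.standard_normal(x0.shape)
+    return x0 + eta_t * (y0 - x0) + kappa * np.sqrt(eta_t) * noise
+
+x0 = np.random.rand(32, 32)               # stand-in high-quality image
+y0 = x0 + 0.2 * np.random.randn(32, 32)   # stand-in degraded image
+x_half = residual_shift_forward(x0, y0, eta_t=0.5)
+```
+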
+
+ comment: Accepted by TPAMI@2024. Code: https://github.com/zsyOAOA/ResShift +
+
+
+
+
+ + ♻ ☆ Fisheye-Calib-Adapter: An Easy Tool for Fisheye Camera Model Conversion + + +
+ The increasing necessity for fisheye cameras in fields such as robotics and +autonomous driving has led to the proposal of various fisheye camera models. +While the evolution of camera models has facilitated the development of diverse +systems in the field, the lack of adaptation between different fisheye camera +models means that recalibration is always necessary, which is cumbersome. This +paper introduces a conversion tool for various previously proposed fisheye +camera models. It is user-friendly, simple, yet extremely fast and accurate, +offering conversion capabilities for a broader range of models compared to +existing tools. We have verified that models converted using our system perform +correctly in applications such as SLAM. By utilizing our system, researchers +can obtain output parameters directly from input parameters without the need +for an image set and any recalibration processes, thus serving as a bridge +across different fisheye camera models in various research fields. We provide +our system as an open source tool available at: +https://github.com/eowjd0512/fisheye-calib-adapter + +
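+ Conversion between fisheye models without recalibration can be understood as
+refitting one projection curve to another: sample incidence angles theta,
+evaluate the source model's image radius r(theta), then least-squares fit the
+target model's parameters to the same (theta, r) pairs. The NumPy sketch below
+converts a simple equidistant model (r = f * theta) into Kannala-Brandt-style
+odd-polynomial coefficients; it is an illustration of the idea only, not the
+linked tool, which supports many more models and handles distortion centres.
+
+```python
+import numpy as np
+
+def equidistant_to_kb(f, max_theta=np.deg2rad(95), n=500):
+    """Fit r = k1*t + k2*t^3 + k3*t^5 + k4*t^7 to an equidistant model r = f*t."""
+    theta = np.linspace(1e-3, max_theta, n)
+    r_src = f * theta                                    # source-model radii
+    A = np.stack([theta, theta**3, theta**5, theta**7], axis=1)
+    k, *_ = np.linalg.lstsq(A, r_src, rcond=None)
+    return k                                             # ~ [f, 0, 0, 0] here
+
+print(equidistant_to_kb(f=300.0))
+```
+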
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Deepfake Detection: A Comprehensive Survey from the Reliability + Perspective + + +
+ The proliferation of Deepfake synthetic material circulating on the internet
+has had a profound social impact on politicians, celebrities, and individuals
+worldwide. In this survey, we provide a thorough review of the existing
+Deepfake detection studies from the reliability perspective. We identify three
+reliability-oriented research challenges in the current Deepfake detection
+domain: transferability, interpretability, and robustness. Moreover, while
+these three challenges have been frequently addressed, the general reliability
+of a detection model has barely been considered, leading to a lack of reliable
+evidence in real-life use and even in the prosecution of Deepfake-related cases
+in court. We therefore introduce a model reliability study metric, based on
+statistical random sampling and publicly available benchmark datasets, to
+review the reliability of existing detection models on arbitrary Deepfake
+candidate suspects. Case studies are further conducted on real-life Deepfake
+cases involving different groups of victims, with the help of the reliably
+qualified detection models reviewed in this survey. Reviews and experiments on
+the existing approaches provide informative discussions and future research
+directions for Deepfake detection.
+
+
+
+
+
+ + ♻ ☆ DualBEV: Unifying Dual View Transformation with Probabilistic + Correspondences ECCV 2024 + + +
+ Camera-based Bird's-Eye-View (BEV) perception often struggles between +adopting 3D-to-2D or 2D-to-3D view transformation (VT). The 3D-to-2D VT +typically employs resource-intensive Transformer to establish robust +correspondences between 3D and 2D features, while the 2D-to-3D VT utilizes the +Lift-Splat-Shoot (LSS) pipeline for real-time application, potentially missing +distant information. To address these limitations, we propose DualBEV, a +unified framework that utilizes a shared feature transformation incorporating +three probabilistic measurements for both strategies. By considering dual-view +correspondences in one stage, DualBEV effectively bridges the gap between these +strategies, harnessing their individual strengths. Our method achieves +state-of-the-art performance without Transformer, delivering comparable +efficiency to the LSS approach, with 55.2% mAP and 63.4% NDS on the nuScenes +test set. Code is available at \url{https://github.com/PeidongLi/DualBEV} + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ GaussianOcc: Fully Self-supervised and Efficient 3D Occupancy Estimation + with Gaussian Splatting + + +
+ We introduce GaussianOcc, a systematic method that investigates the two +usages of Gaussian splatting for fully self-supervised and efficient 3D +occupancy estimation in surround views. First, traditional methods for +self-supervised 3D occupancy estimation still require ground truth 6D poses +from sensors during training. To address this limitation, we propose Gaussian +Splatting for Projection (GSP) module to provide accurate scale information for +fully self-supervised training from adjacent view projection. Additionally, +existing methods rely on volume rendering for final 3D voxel representation +learning using 2D signals (depth maps, semantic maps), which is both +time-consuming and less effective. We propose Gaussian Splatting from Voxel +space (GSV) to leverage the fast rendering properties of Gaussian splatting. As +a result, the proposed GaussianOcc method enables fully self-supervised (no +ground truth pose) 3D occupancy estimation in competitive performance with low +computational cost (2.7 times faster in training and 5 times faster in +rendering). The relevant code will be available in +https://github.com/GANWANSHUI/GaussianOcc.git. + +
+
+ comment: Project page: https://ganwanshui.github.io/GaussianOcc/ +
+
+
+
+
+ + ♻ ☆ Global-guided Focal Neural Radiance Field for Large-scale Scene + Rendering WACV 2025 + + +
+ Neural radiance fields (NeRF) have recently been applied to render
+large-scale scenes. However, their limited model capacity typically results in
+blurred rendering results. Existing large-scale NeRFs primarily address this
+limitation by partitioning the scene into blocks, which are subsequently
+handled by separate sub-NeRFs. These sub-NeRFs, trained from scratch and
+processed independently, lead to inconsistencies in geometry and appearance
+across the scene. Consequently, the rendering quality fails to exhibit
+significant improvement despite the expansion of model capacity. In this work,
+we present the global-guided focal neural radiance field (GF-NeRF), which
+achieves high-fidelity rendering of large-scale scenes. Our proposed GF-NeRF
+utilizes a two-stage (Global and Focal) architecture and a global-guided
+training strategy. The global stage obtains a continuous representation of the
+entire scene, while the focal stage decomposes the scene into multiple blocks
+and further processes them with distinct sub-encoders. Leveraging this
+two-stage architecture, sub-encoders only need fine-tuning based on the global
+encoder, thus reducing training complexity in the focal stage while maintaining
+scene-wide consistency. Spatial information and error information from the
+global stage also help the sub-encoders focus on crucial areas and effectively
+capture more details of large-scale scenes. Notably, our approach does not rely
+on any prior knowledge about the target scene, which makes GF-NeRF adaptable to
+various large-scale scene types, including street-view and aerial-view scenes.
+We demonstrate that our method achieves high-fidelity, natural rendering
+results on various types of large-scale datasets. Our project page:
+https://shaomq2187.github.io/GF-NeRF/
+
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ CerberusDet: Unified Multi-Dataset Object Detection + + +
+ Conventional object detection models are usually limited by the data on which +they were trained and by the category logic they define. With the recent rise +of Language-Visual Models, new methods have emerged that are not restricted to +these fixed categories. Despite their flexibility, such Open Vocabulary +detection models still fall short in accuracy compared to traditional models +with fixed classes. At the same time, more accurate data-specific models face +challenges when there is a need to extend classes or merge different datasets +for training. The latter often cannot be combined due to different logics or +conflicting class definitions, making it difficult to improve a model without +compromising its performance. In this paper, we introduce CerberusDet, a +framework with a multi-headed model designed for handling multiple object +detection tasks. Proposed model is built on the YOLO architecture and +efficiently shares visual features from both backbone and neck components, +while maintaining separate task heads. This approach allows CerberusDet to +perform very efficiently while still delivering optimal results. We evaluated +the model on the PASCAL VOC dataset and Objects365 dataset to demonstrate its +abilities. CerberusDet achieved state-of-the-art results with 36% less +inference time. The more tasks are trained together, the more efficient the +proposed model becomes compared to running individual models sequentially. The +training and inference code, as well as the model, are available as open-source +(https://github.com/ai-forever/CerberusDet). + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on Deep Multimodal Learning with Missing Modality + + +
+ During multimodal model training and reasoning, data samples may miss certain +modalities and lead to compromised model performance due to sensor limitations, +cost constraints, privacy concerns, data loss, and temporal and spatial +factors. This survey provides an overview of recent progress in Multimodal +Learning with Missing Modality (MLMM), focusing on deep learning techniques. It +is the first comprehensive survey that covers the historical background and the +distinction between MLMM and standard multimodal learning setups, followed by a +detailed analysis of current MLMM methods, applications, and datasets, +concluding with a discussion about challenges and potential future directions +in the field. + +
+
+ comment: Work in progress; open to discussion; planning to submit to ACM CSUR + in September +
+
+
+
+
+ + ♻ ☆ OC4-ReID: Occluded Cloth-Changing Person Re-Identification + + +
+ The study of Cloth-Changing Person Re-identification (CC-ReID) focuses on +retrieving specific pedestrians when their clothing has changed, typically +under the assumption that the entire pedestrian images are visible. Pedestrian +images in real-world scenarios, however, are often partially obscured by +obstacles, presenting a significant challenge to existing CC-ReID systems. In +this paper, we introduce a more challenging task termed Occluded Cloth-Changing +Person Re-Identification (OC4-ReID), which simultaneously addresses two +challenges of clothing changes and occlusion. Concretely, we construct two new +datasets, Occ-LTCC and Occ-PRCC, based on original CC-ReID datasets to include +random occlusions of key pedestrians components (e.g., head, torso). Moreover, +a novel benchmark is proposed for OC4-ReID incorporating a Train-Test Micro +Granularity Screening (T2MGS) module to mitigate the influence of occlusion and +proposing a Part-Robust Triplet (PRT) loss for partial features learning. +Comprehensive experiments on the proposed datasets, as well as on two CC-ReID +benchmark datasets demonstrate the superior performance of proposed method +against other state-of-the-art methods. The codes and datasets are available +at: https://github.com/1024AILab/OC4-ReID. + +
+
+
+
+
+ + ♻ ☆ Adversarial Attacks and Defenses on Text-to-Image Diffusion Models: A + Survey + + +
+ Recently, the text-to-image diffusion model has gained considerable attention +from the community due to its exceptional image generation capability. A +representative model, Stable Diffusion, amassed more than 10 million users +within just two months of its release. This surge in popularity has facilitated +studies on the robustness and safety of the model, leading to the proposal of +various adversarial attack methods. Simultaneously, there has been a marked +increase in research focused on defense methods to improve the robustness and +safety of these models. In this survey, we provide a comprehensive review of +the literature on adversarial attacks and defenses targeting text-to-image +diffusion models. We begin with an overview of text-to-image diffusion models, +followed by an introduction to a taxonomy of adversarial attacks and an +in-depth review of existing attack methods. We then present a detailed analysis +of current defense methods that improve model robustness and safety. Finally, +we discuss ongoing challenges and explore promising future research directions. +For a complete list of the adversarial attack and defense methods covered in +this survey, please refer to our curated repository at +https://github.com/datar001/Awesome-AD-on-T2IDM. + +
+
+ comment: Accepted for Information Fusion. Related benchmarks and codes are + available at \url{https://github.com/datar001/Awesome-AD-on-T2IDM} +
+
+
+
+
+ + ♻ ☆ Semi-supervised 3D Semantic Scene Completion with 2D Vision Foundation + Model Guidance + + +
+ Accurate prediction of 3D semantic occupancy from 2D visual images is vital +in enabling autonomous agents to comprehend their surroundings for planning and +navigation. State-of-the-art methods typically employ fully supervised +approaches, necessitating a huge labeled dataset acquired through expensive +LiDAR sensors and meticulous voxel-wise labeling by human annotators. The +resource-intensive nature of this annotating process significantly hampers the +application and scalability of these methods. We introduce a novel +semi-supervised framework to alleviate the dependency on densely annotated +data. Our approach leverages 2D foundation models to generate essential 3D +scene geometric and semantic cues, facilitating a more efficient training +process. Our framework exhibits notable properties: (1) Generalizability, +applicable to various 3D semantic scene completion approaches, including 2D-3D +lifting and 3D-2D transformer methods. (2) Effectiveness, as demonstrated +through experiments on SemanticKITTI and NYUv2, wherein our method achieves up +to 85% of the fully-supervised performance using only 10% labeled data. This +approach not only reduces the cost and labor associated with data annotation +but also demonstrates the potential for broader adoption in camera-based +systems for 3D semantic occupancy prediction. + +
+
+
+
+
+ + ♻ ☆ ScreenMark: Watermarking Arbitrary Visual Content on Screen + + +
+ Digital watermarking has demonstrated its effectiveness in protecting
+multimedia content. However, existing watermarking methods are predominantly
+tailored to specific media types, rendering them less effective for protecting
+content displayed on computer screens, which is often multimodal and dynamic.
+Visual Screen Content (VSC) is particularly susceptible to theft and leakage
+via screenshots, a vulnerability that current watermarking methods fail to
+adequately address. To tackle these challenges, we propose ScreenMark, a robust
+and practical watermarking method designed specifically for arbitrary VSC
+protection. ScreenMark utilizes a three-stage progressive watermarking
+framework. Initially, inspired by diffusion principles, we initialize the
+mutual transformation between regular watermark information and irregular
+watermark patterns. Subsequently, these patterns are integrated with screen
+content using a pre-multiplied alpha blending technique, supported by a
+pre-trained screen decoder for accurate watermark retrieval. A distorter of
+progressively increasing complexity enhances the robustness of the watermark in
+real-world screenshot scenarios. Finally, the model undergoes fine-tuning
+guided by a joint-level distorter to ensure optimal performance. To validate
+the effectiveness of ScreenMark, we compiled a dataset comprising 100,000
+screenshots from various devices and resolutions. Extensive experiments across
+different datasets confirm the method's superior robustness, imperceptibility,
+and practical applicability.
+
+
+
+
+
+ + ♻ ☆ Wildfire Risk Prediction: A Review + + +
+ Wildfires have significant impacts on global vegetation, wildlife, and +humans. They destroy plant communities and wildlife habitats and contribute to +increased emissions of carbon dioxide, nitrogen oxides, methane, and other +pollutants. The prediction of wildfires relies on various independent variables +combined with regression or machine learning methods. In this technical review, +we describe the options for independent variables, data processing techniques, +models, independent variables collinearity and importance estimation methods, +and model performance evaluation metrics. First, we divide the independent +variables into 4 aspects, including climate and meteorology conditions, +socio-economical factors, terrain and hydrological features, and wildfire +historical records. Second, preprocessing methods are described for different +magnitudes, different spatial-temporal resolutions, and different formats of +data. Third, the collinearity and importance evaluation methods of independent +variables are also considered. Fourth, we discuss the application of +statistical models, traditional machine learning models, and deep learning +models in wildfire risk prediction. In this subsection, compared with other +reviews, this manuscript particularly discusses the evaluation metrics and +recent advancements in deep learning methods. Lastly, addressing the +limitations of current research, this paper emphasizes the need for more +effective deep learning time series forecasting algorithms, the utilization of +three-dimensional data including ground and trunk fuel, extraction of more +accurate historical fire point data, and improved model evaluation metrics. + +
+
+
+
+
+ + ♻ ☆ FlowDreamer: exploring high fidelity text-to-3D generation via rectified + flow + + +
+ Recent advances in text-to-3D generation have made significant progress. In
+particular, with pretrained diffusion models, existing methods predominantly
+use Score Distillation Sampling (SDS) to train 3D models such as Neural
+Radiance Fields (NeRF) and 3D Gaussian Splatting (3D GS). However, a hurdle is
+that they often encounter difficulties with over-smoothed textures and
+over-saturated colors. The rectified flow model - which utilizes a simple
+ordinary differential equation (ODE) to represent a linear trajectory - shows
+promise as an alternative prior for text-to-3D generation. It learns a
+time-independent vector field, thereby reducing the ambiguity in 3D model
+update gradients that are calculated using time-dependent scores in the SDS
+framework. In light of this, we first develop a mathematical analysis to
+seamlessly integrate SDS with the rectified flow model, paving the way for our
+initial framework known as Vector Field Distillation Sampling (VFDS). However,
+empirical findings indicate that VFDS still results in over-smoothed outcomes.
+Therefore, we analyze the underlying reasons for this failure from the
+perspective of ODE trajectories. Building on this analysis, we propose a novel
+framework, named FlowDreamer, which yields high-fidelity results with richer
+texture details and faster convergence. The key insight is to leverage the
+coupling and reversible properties of the rectified flow model to search for
+the corresponding noise, rather than using randomly sampled noise as in VFDS.
+Accordingly, we introduce a novel Unique Couple Matching (UCM) loss, which
+guides the 3D model to optimize along the same trajectory. FlowDreamer is
+flexible enough to be applied to both NeRF and 3D GS. Extensive experiments
+demonstrate the high-fidelity outcomes and accelerated convergence of
+FlowDreamer.
+
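+ For context, the SDS objective referenced above has the standard
+(DreamFusion-style) gradient shown below, where x(theta) is the rendered image,
+x_t its noised version, and epsilon-hat the diffusion model's noise prediction;
+as described in the abstract, VFDS replaces the time-dependent noise/score term
+with the rectified-flow vector field. This is the standard SDS gradient, not a
+derivation specific to this paper.
+
+```latex
+\nabla_{\theta}\mathcal{L}_{\mathrm{SDS}}(\theta)
+  = \mathbb{E}_{t,\epsilon}\!\left[
+      w(t)\,\bigl(\hat{\epsilon}_{\phi}(x_t;\,y,\,t)-\epsilon\bigr)\,
+      \frac{\partial x}{\partial \theta}
+    \right],
+\qquad
+x_t=\alpha_t\,x(\theta)+\sigma_t\,\epsilon .
+```
+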
+
+ comment: Tech Report +
+
+
+
+
+ + ♻ ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. +Precise manual segmentation of facial wrinkles is challenging and +time-consuming, with inherent subjectivity leading to inconsistent results +among graders. To address this issue, we propose two solutions. First, we build +and release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an +extension of the NVIDIA FFHQ dataset. It includes 1,000 images with human +labels and 50,000 images with automatically generated weak labels. This dataset +could serve as a foundation for the research community to develop advanced +wrinkle detection algorithms. Second, we introduce a simple training strategy +utilizing texture maps, applicable to various segmentation models, to detect +wrinkles across the face. Our two-stage training strategy first pretrain models +on a large dataset with weak labels (N=50k), or masked texture maps generated +through computer vision techniques, without human intervention. We then +finetune the models using human-labeled data (N=1k), which consists of manually +labeled wrinkle masks. The network takes as input a combination of RGB and +masked texture map of the image, comprising four channels, in finetuning. We +effectively combine labels from multiple annotators to minimize subjectivity in +manual labeling. Our strategies demonstrate improved segmentation performance +in facial wrinkle segmentation both quantitatively and visually compared to +existing pretraining methods. The dataset is available at +https://github.com/labhai/ffhq-wrinkle-dataset. + +
+
+
+
+
+ + ♻ ☆ MMCBE: Multi-modality Dataset for Crop Biomass Prediction and Beyond + + +
+ Crop biomass, a critical indicator of plant growth, health, and productivity, +is invaluable for crop breeding programs and agronomic research. However, the +accurate and scalable quantification of crop biomass remains inaccessible due +to limitations in existing measurement methods. One of the obstacles impeding +the advancement of current crop biomass prediction methodologies is the +scarcity of publicly available datasets. Addressing this gap, we introduce a +new dataset in this domain, i.e. Multi-modality dataset for crop biomass +estimation (MMCBE). Comprising 216 sets of multi-view drone images, coupled +with LiDAR point clouds, and hand-labelled ground truth, MMCBE represents the +first multi-modality one in the field. This dataset aims to establish benchmark +methods for crop biomass quantification and foster the development of +vision-based approaches. We have rigorously evaluated state-of-the-art crop +biomass estimation methods using MMCBE and ventured into additional potential +applications, such as 3D crop reconstruction from drone imagery and novel-view +rendering. With this publication, we are making our comprehensive dataset +available to the broader community. + +
+
+ comment: 10 pages, 10 figures, 3 tables +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ Contri(e)ve: Context + Retrieve for Scholarly Question Answering + + +
+ Scholarly communication is a rapidly growing field containing a wealth of
+knowledge. However, because this knowledge is spread across unstructured
+documents, it is challenging to extract useful information from them through
+conventional document retrieval methods. Scholarly knowledge graphs address
+this problem by representing the documents in a semantic network, providing
+hidden insights, summaries, and easy accessibility through queries. Naturally,
+question answering over scholarly graphs expands this accessibility to a wider
+audience. But some of the knowledge in this domain is still presented as
+unstructured text, thus requiring a hybrid solution for question answering
+systems. In this paper, we present a two-step solution using the open-source
+Large Language Model (LLM) Llama 3.1 for the Scholarly-QALD dataset. First, we
+extract the context pertaining to the question from different structured and
+unstructured data sources: the DBLP and SemOpenAlex knowledge graphs and
+Wikipedia text. Second, we apply prompt engineering to improve the information
+retrieval performance of the LLM. Our approach achieved an F1 score of 40%; we
+also observed some anomalous responses from the LLM, which are discussed in the
+final part of the paper.
+
+
+
+
+
+ + ☆ Comparative Analysis of Pretrained Audio Representations in Music + Recommender Systems + + +
+ Over the years, Music Information Retrieval (MIR) has proposed various models +pretrained on large amounts of music data. Transfer learning showcases the +proven effectiveness of pretrained backend models with a broad spectrum of +downstream tasks, including auto-tagging and genre classification. However, MIR +papers generally do not explore the efficiency of pretrained models for Music +Recommender Systems (MRS). In addition, the Recommender Systems community tends +to favour traditional end-to-end neural network learning over these models. Our +research addresses this gap and evaluates the applicability of six pretrained +backend models (MusicFM, Music2Vec, MERT, EncodecMAE, Jukebox, and MusiCNN) in +the context of MRS. We assess their performance using three recommendation +models: K-nearest neighbours (KNN), shallow neural network, and BERT4Rec. Our +findings suggest that pretrained audio representations exhibit significant +performance variability between traditional MIR tasks and MRS, indicating that +valuable aspects of musical information captured by backend models may differ +depending on the task. This study establishes a foundation for further +exploration of pretrained audio representations to enhance music recommendation +systems. + +
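+ A KNN recommender on top of frozen audio embeddings, one of the three
+recommendation models evaluated above, amounts to ranking catalogue tracks by
+similarity to a user profile vector. The NumPy sketch below shows that idea
+only; the embedding dimension and mean-pooling of the listening history are
+assumptions, not the paper's exact setup.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+item_emb = rng.standard_normal((1000, 512))   # stand-in pretrained track embeddings
+item_emb /= np.linalg.norm(item_emb, axis=1, keepdims=True)
+
+def recommend(history_ids, k=10):
+    """Rank unseen tracks by cosine similarity to the mean of listened tracks."""
+    profile = item_emb[history_ids].mean(axis=0)
+    profile /= np.linalg.norm(profile)
+    scores = item_emb @ profile               # cosine similarity (unit vectors)
+    scores[history_ids] = -np.inf             # exclude already-heard tracks
+    return np.argsort(-scores)[:k]
+
+print(recommend([3, 42, 7]))
+```
+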
+
+
+
+
+ + ☆ Accurate and Fast Estimation of Temporal Motifs using Path Sampling ICDM'24 + + +
+ Counting the number of small subgraphs, called motifs, is a fundamental +problem in social network analysis and graph mining. Many real-world networks +are directed and temporal, where edges have timestamps. Motif counting in +directed, temporal graphs is especially challenging because there are a +plethora of different kinds of patterns. Temporal motif counts reveal much +richer information and there is a need for scalable algorithms for motif +counting. + A major challenge in counting is that there can be trillions of temporal +motif matches even with a graph with only millions of vertices. Both the motifs +and the input graphs can have multiple edges between two vertices, leading to a +combinatorial explosion problem. Counting temporal motifs involving just four +vertices is not feasible with current state-of-the-art algorithms. + We design an algorithm, TEACUPS, that addresses this problem using a novel +technique of temporal path sampling. We combine a path sampling method with +carefully designed temporal data structures, to propose an efficient +approximate algorithm for temporal motif counting. TEACUPS is an unbiased +estimator with provable concentration behavior, which can be used to bound the +estimation error. For a Bitcoin graph with hundreds of millions of edges, +TEACUPS runs in less than 1 minute, while the exact counting algorithm takes +more than a day. We empirically demonstrate the accuracy of TEACUPS on large +datasets, showing an average of 30$\times$ speedup (up to 2000$\times$ speedup) +compared to existing GPU-based exact counting methods while preserving high +count estimation accuracy. + +
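+ The path-sampling idea behind TEACUPS can be illustrated on the simplest
+temporal pattern, a 2-path u -> v -> w whose two edges occur within a window
+delta: sample edges uniformly, count their temporal continuations, and rescale
+by the number of edges. The sketch below is a deliberately simplified analogue
+of that idea, not the TEACUPS algorithm itself.
+
+```python
+import random
+from collections import defaultdict
+
+def estimate_temporal_2paths(edges, delta, n_samples=10_000, seed=0):
+    """Unbiased estimate of #{(u,v,t1),(v,w,t2) : t1 < t2 <= t1 + delta}."""
+    random.seed(seed)
+    out_times = defaultdict(list)             # source vertex -> out-edge timestamps
+    for u, v, t in edges:
+        out_times[u].append(t)
+    total = 0
+    for _ in range(n_samples):
+        u, v, t1 = random.choice(edges)       # uniform edge sample
+        total += sum(t1 < t2 <= t1 + delta for t2 in out_times[v])
+    return len(edges) * total / n_samples     # rescale to all edges
+
+edges = [(0, 1, 1.0), (1, 2, 1.5), (1, 3, 4.0), (2, 3, 2.0)]
+print(estimate_temporal_2paths(edges, delta=1.0))   # true count is 2
+```
+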
+
+ comment: Accepted for ICDM'24 +
+
+
+
+
+ + ☆ Proactive Recommendation in Social Networks: Steering User Interest via + Neighbor Influence + + +
+ Recommending items solely catering to users' historical interests narrows +users' horizons. Recent works have considered steering target users beyond +their historical interests by directly adjusting items exposed to them. +However, the recommended items for direct steering might not align perfectly +with users' interests evolution, detrimentally affecting target users' +experience. To avoid this issue, we propose a new task named Proactive +Recommendation in Social Networks (PRSN) that indirectly steers users' interest +by utilizing the influence of social neighbors, i.e., indirect steering by +adjusting the exposure of a target item to target users' neighbors. The key to +PRSN lies in answering an interventional question: what would a target user's +feedback be on a target item if the item is exposed to the user's different +neighbors? To answer this question, we resort to causal inference and formalize +PRSN as: (1) estimating the potential feedback of a user on an item, under the +network interference by the item's exposure to the user's neighbors; and (2) +adjusting the exposure of a target item to target users' neighbors to trade off +steering performance and the damage to the neighbors' experience. To this end, +we propose a Neighbor Interference Recommendation (NIRec) framework with two +key modules: (1)an interference representation-based estimation module for +modeling potential feedback; and (2) a post-learning-based optimization module +for optimizing a target item's exposure to trade off steering performance and +the neighbors' experience by greedy search. We conduct extensive +semi-simulation experiments based on three real-world datasets, validating the +steering effectiveness of NIRec. + +
+
+
+
+
+ + ☆ LLM-based Weak Supervision Framework for Query Intent Classification in + Video Search + + +
+ Streaming services have reshaped how we discover and engage with digital +entertainment. Despite these advancements, effectively understanding the wide +spectrum of user search queries continues to pose a significant challenge. An +accurate query understanding system that can handle a variety of entities that +represent different user intents is essential for delivering an enhanced user +experience. We can build such a system by training a natural language +understanding (NLU) model; however, obtaining high-quality labeled training +data in this specialized domain is a substantial obstacle. Manual annotation is +costly and impractical for capturing users' vast vocabulary variations. To +address this, we introduce a novel approach that leverages large language +models (LLMs) through weak supervision to automatically annotate a vast +collection of user search queries. Using prompt engineering and a diverse set +of LLM personas, we generate training data that matches human annotator +expectations. By incorporating domain knowledge via Chain of Thought and +In-Context Learning, our approach leverages the labeled data to train +low-latency models optimized for real-time inference. Extensive evaluations +demonstrated that our approach outperformed the baseline with an average +relative gain of 113% in recall. Furthermore, our novel prompt engineering +framework yields higher quality LLM-generated data to be used for weak +supervision; we observed 47.60% improvement over baseline in agreement rate +between LLM predictions and human annotations with respect to F1 score, +weighted according to the distribution of occurrences of the search queries. +Our persona selection routing mechanism further adds an additional 3.67% +increase in weighted F1 score on top of our novel prompt engineering framework. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ NeSHFS: Neighborhood Search with Heuristic-based Feature Selection for + Click-Through Rate Prediction + + +
+ Click-through-rate (CTR) prediction plays an important role in online +advertising and ad recommender systems. In the past decade, maximizing CTR has +been the main focus of model development and solution creation. Therefore, +researchers and practitioners have proposed various models and solutions to +enhance the effectiveness of CTR prediction. Most of the existing literature +focuses on capturing either implicit or explicit feature interactions. Although +implicit interactions are successfully captured in some studies, explicit +interactions present a challenge for achieving high CTR by extracting both +low-order and high-order feature interactions. Unnecessary and irrelevant +features may cause high computational time and low prediction performance. +Furthermore, certain features may perform well with specific predictive models +while underperforming with others. Also, feature distribution may fluctuate due +to traffic variations. Most importantly, in live production environments, +resources are limited, and the time for inference is just as crucial as +training time. Because of all these reasons, feature selection is one of the +most important factors in enhancing CTR prediction model performance. Simple +filter-based feature selection algorithms do not perform well and they are not +sufficient. An effective and efficient feature selection algorithm is needed to +consistently filter the most useful features during live CTR prediction +process. In this paper, we propose a heuristic algorithm named Neighborhood +Search with Heuristic-based Feature Selection (NeSHFS) to enhance CTR +prediction performance while reducing dimensionality and training time costs. +We conduct comprehensive experiments on three public datasets to validate the +efficiency and effectiveness of our proposed solution. + +
+
+
+
+
+ + ☆ ATFLRec: A Multimodal Recommender System with Audio-Text Fusion and + Low-Rank Adaptation via Instruction-Tuned Large Language Model + + +
+ Recommender Systems (RS) play a pivotal role in boosting user satisfaction by
+providing personalized product suggestions in domains such as e-commerce and
+entertainment. This study examines the integration of multimodal data (text and
+audio) into large language models (LLMs) with the aim of enhancing
+recommendation performance. Traditional text and audio recommenders encounter
+limitations such as the cold-start problem, and recent advancements in LLMs,
+while promising, are computationally expensive. To address these issues,
+Low-Rank Adaptation (LoRA) is introduced, which enhances efficiency without
+compromising performance. The ATFLRec framework is proposed to integrate audio
+and text modalities into a multimodal recommendation system, utilizing various
+LoRA configurations and modality fusion techniques. Results indicate that
+ATFLRec outperforms baseline models, including traditional and graph neural
+network-based approaches, achieving higher AUC scores. Furthermore, separate
+fine-tuning of audio and text data with distinct LoRA modules yields optimal
+performance, with different pooling methods and numbers of Mel filter banks
+significantly impacting performance. This research offers valuable insights
+into optimizing multimodal recommender systems and advancing the integration of
+diverse data modalities in LLMs.
+
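+ LoRA, as used above, freezes a pretrained weight matrix W and learns a
+low-rank update B A, so the adapted layer computes W x + (alpha / r) * B A x
+with only a small number of trainable parameters. The PyTorch sketch below is a
+generic LoRA layer, not the specific ATFLRec configuration.
+
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen linear layer plus a trainable low-rank update (generic LoRA)."""
+    def __init__(self, base: nn.Linear, r=8, alpha=16):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False           # freeze the pretrained weights
+        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at start
+        self.scale = alpha / r
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+
+layer = LoRALinear(nn.Linear(768, 768))
+out = layer(torch.randn(4, 768))              # only A and B receive gradients
+```
+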
+
+
+
+
+ + ☆ Exploring Information Retrieval Landscapes: An Investigation of a Novel + Evaluation Techniques and Comparative Document Splitting Methods + + +
+ The performance of Retrieval-Augmented Generation (RAG) systems in +information retrieval is significantly influenced by the characteristics of the +documents being processed. In this study, the structured nature of textbooks, +the conciseness of articles, and the narrative complexity of novels are shown +to require distinct retrieval strategies. A comparative evaluation of multiple +document-splitting methods reveals that the Recursive Character Splitter +outperforms the Token-based Splitter in preserving contextual integrity. A +novel evaluation technique is introduced, utilizing an open-source model to +generate a comprehensive dataset of question-and-answer pairs, simulating +realistic retrieval scenarios to enhance testing efficiency and metric +reliability. The evaluation employs weighted scoring metrics, including +SequenceMatcher, BLEU, METEOR, and BERT Score, to assess the system's accuracy +and relevance. This approach establishes a refined standard for evaluating the +precision of RAG systems, with future research focusing on optimizing chunk and +overlap sizes to improve retrieval accuracy and efficiency. + +
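+ A recursive character splitter of the kind compared above falls back through a
+list of separators (paragraph, line, sentence, word, character) until every
+chunk fits a size budget, which is why it tends to preserve contextual
+integrity better than fixed token windows. The sketch below is a simplified
+version of that strategy, not the exact implementation used in the study.
+
+```python
+def recursive_split(text, chunk_size=500, separators=("\n\n", "\n", ". ", " ", "")):
+    """Split text on the coarsest separator that keeps chunks under chunk_size."""
+    if len(text) <= chunk_size:
+        return [text]
+    sep, rest = separators[0], separators[1:] or separators
+    parts = text.split(sep) if sep else list(text)
+    chunks, current = [], ""
+    for part in parts:
+        piece = part + sep
+        if len(current) + len(piece) <= chunk_size:
+            current += piece
+        else:
+            if current:
+                chunks.append(current)
+            current = ""
+            if len(piece) > chunk_size:       # recurse with a finer separator
+                chunks.extend(recursive_split(piece, chunk_size, rest))
+            else:
+                current = piece
+    if current:
+        chunks.append(current)
+    return chunks
+
+print([len(c) for c in recursive_split("lorem ipsum dolor sit amet " * 60, 300)])
+```
+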
+
+ comment: This article is 16 pages long and includes detailed comparisons of + RAG systems and document splitting techniques +
+
+
+
+
+ + ♻ ☆ Federated Transfer Learning Based Cooperative Wideband Spectrum Sensing + with Model Pruning + + +
+ For ultra-wideband and high-rate wireless communication systems, wideband +spectrum sensing (WSS) is critical, since it empowers secondary users (SUs) to +capture the spectrum holes for opportunistic transmission. However, WSS +encounters challenges such as excessive costs of hardware and computation due +to the high sampling rate, as well as robustness issues arising from scenario +mismatch. In this paper, a WSS neural network (WSSNet) is proposed by +exploiting multicoset preprocessing to enable the sub-Nyquist sampling, with +the two dimensional convolution design specifically tailored to work with the +preprocessed samples. A federated transfer learning (FTL) based framework +mobilizing multiple SUs is further developed to achieve a robust model +adaptable to various scenarios, which is paved by the selective weight pruning +for the fast model adaptation and inference. Simulation results demonstrate +that the proposed FTL-WSSNet achieves the fairly good performance in different +target scenarios even without local adaptation samples. + +
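+ The "selective weight pruning" used above for fast model adaptation can be
+illustrated with plain magnitude pruning: zero out the fraction of weights with
+the smallest absolute values and keep (or transmit) only the rest. The NumPy
+sketch below is a generic illustration, not the FTL-WSSNet pruning scheme.
+
+```python
+import numpy as np
+
+def magnitude_prune(weights, sparsity=0.5):
+    """Return a pruned copy of `weights` and the mask of kept entries (0 < sparsity < 1)."""
+    flat = np.abs(weights).ravel()
+    k = int(sparsity * flat.size)
+    threshold = np.partition(flat, k)[k]      # k-th smallest magnitude
+    mask = np.abs(weights) >= threshold
+    return weights * mask, mask
+
+w = np.random.randn(64, 64)
+w_pruned, mask = magnitude_prune(w, sparsity=0.7)
+print(mask.mean())                            # ~0.3 of the weights survive
+```
+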
+
+
+
+
+ + ♻ ☆ RePlay: a Recommendation Framework for Experimentation and Production + Use + + +
+ Using a single tool to build and compare recommender systems significantly +reduces the time to market for new models. In addition, the comparison results +when using such tools look more consistent. This is why many different tools +and libraries for researchers in the field of recommendations have recently +appeared. Unfortunately, most of these frameworks are aimed primarily at +researchers and require modification for use in production due to the inability +to work on large datasets or an inappropriate architecture. In this demo, we +present our open-source toolkit RePlay - a framework containing an end-to-end +pipeline for building recommender systems, which is ready for production use. +RePlay also allows you to use a suitable stack for the pipeline at each stage: +Pandas, Polars, or Spark. This allows the library to scale computations and +deploy to a cluster. Thus, RePlay allows data scientists to easily move from +research mode to production mode using the same interfaces. + +
+
+
+
+
+ + ♻ ☆ STORE: Streamlining Semantic Tokenization and Generative Recommendation + with A Single LLM + + +
+ Traditional recommendation models often rely on unique item identifiers (IDs) +to distinguish between items, which can hinder their ability to effectively +leverage item content information and generalize to long-tail or cold-start +items. Recently, semantic tokenization has been proposed as a promising +solution that aims to tokenize each item's semantic representation into a +sequence of discrete tokens. In this way, it preserves the item's semantics +within these tokens and ensures that semantically similar items are represented +by similar tokens. These semantic tokens have become fundamental in training +generative recommendation models. However, existing generative recommendation +methods typically involve multiple sub-models for embedding, quantization, and +recommendation, leading to an overly complex system. In this paper, we propose +to streamline the semantic tokenization and generative recommendation process +with a unified framework, dubbed STORE, which leverages a single large language +model (LLM) for both tasks. Specifically, we formulate semantic tokenization as +a text-to-token task and generative recommendation as a token-to-token task, +supplemented by a token-to-text reconstruction task and a text-to-token +auxiliary task. All these tasks are framed in a generative manner and trained +using a single LLM backbone. Extensive experiments have been conducted to +validate the effectiveness of our STORE framework across various recommendation +tasks and datasets. We will release the source code and configurations for +reproducible research. + +
+
+
+
+
+
+
+
+ + Machine Learning 138 + +
+
+
+ + ☆ The unknotting number, hard unknot diagrams, and reinforcement learning + + +
+ We have developed a reinforcement learning agent that often finds a minimal +sequence of unknotting crossing changes for a knot diagram with up to 200 +crossings, hence giving an upper bound on the unknotting number. We have used +this to determine the unknotting number of 57k knots. We took diagrams of +connected sums of such knots with oppositely signed signatures, where the +summands were overlaid. The agent has found examples where several of the +crossing changes in an unknotting collection of crossings result in hyperbolic +knots. Based on this, we have shown that, given knots $K$ and $K'$ that satisfy +some mild assumptions, there is a diagram of their connected sum with $u(K) + +u(K')$ unknotting crossings such that changing any one of them results in a +prime knot. As a by-product, we have obtained a dataset of 2.6 million distinct +hard unknot diagrams, most of them under 35 crossings. Assuming the additivity +of the unknotting number, we have determined the unknotting number of 43 knots +with at most 12 crossings for which the unknotting number was previously unknown. + +
+
+ comment: 29 pages, 17 figures +
+
+
+
+
+ + ☆ INN-PAR: Invertible Neural Network for PPG to ABP Reconstruction + + +
+ Non-invasive and continuous blood pressure (BP) monitoring is essential for +the early prevention of many cardiovascular diseases. Estimating arterial blood +pressure (ABP) from photoplethysmography (PPG) has emerged as a promising +solution. However, existing deep learning approaches for PPG-to-ABP +reconstruction (PAR) encounter certain information loss, impacting the +precision of the reconstructed signal. To overcome this limitation, we +introduce an invertible neural network for PPG to ABP reconstruction (INN-PAR), +which employs a series of invertible blocks to jointly learn the mapping +between PPG and its gradient with the ABP signal and its gradient. INN-PAR +efficiently captures both forward and inverse mappings simultaneously, thereby +preventing information loss. By integrating signal gradients into the learning +process, INN-PAR enhances the network's ability to capture essential +high-frequency details, leading to more accurate signal reconstruction. +Moreover, we propose a multi-scale convolution module (MSCM) within the +invertible block, enabling the model to learn features across multiple scales +effectively. We have experimented on two benchmark datasets, which show that +INN-PAR significantly outperforms the state-of-the-art methods in both waveform +reconstruction and BP measurement accuracy. + +
+
+
+
+
+ + ☆ An Efficient and Streaming Audio Visual Active Speaker Detection System + + +
+ This paper delves into the challenging task of Active Speaker Detection +(ASD), where the system needs to determine in real-time whether a person is +speaking or not in a series of video frames. While previous works have made +significant strides in improving network architectures and learning effective +representations for ASD, a critical gap exists in the exploration of real-time +system deployment. Existing models often suffer from high latency and memory +usage, rendering them impractical for immediate applications. To bridge this +gap, we present two scenarios that address the key challenges posed by +real-time constraints. First, we introduce a method to limit the number of +future context frames utilized by the ASD model. By doing so, we alleviate the +need for processing the entire sequence of future frames before a decision is +made, significantly reducing latency. Second, we propose a more stringent +constraint that limits the total number of past frames the model can access +during inference. This tackles the persistent memory issues associated with +running streaming ASD systems. Beyond these theoretical frameworks, we conduct +extensive experiments to validate our approach. Our results demonstrate that +constrained transformer models can achieve performance comparable to or even +better than state-of-the-art recurrent models, such as uni-directional GRUs, +with a significantly reduced number of context frames. Moreover, we shed light +on the temporal memory requirements of ASD systems, revealing that larger past +context has a more profound impact on accuracy than future context. When +profiling on a CPU we find that our efficient architecture is memory bound by +the amount of past context it can use and that the compute cost is negligible +as compared to the memory cost. + +
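One common way to realize the constraints on future and past context frames described above is a banded attention mask; the snippet below is a generic sketch of such a mask for a transformer over a sequence of frames, not the authors' implementation.

```python
import torch

def banded_attention_mask(num_frames: int, max_past: int, max_future: int) -> torch.Tensor:
    """Boolean mask where True marks positions a frame may attend to.

    Each frame t can attend to frames in [t - max_past, t + max_future],
    which bounds both latency (future context) and memory (past context).
    """
    idx = torch.arange(num_frames)
    offset = idx[None, :] - idx[:, None]          # offset[t, s] = s - t
    return (offset >= -max_past) & (offset <= max_future)

mask = banded_attention_mask(num_frames=10, max_past=4, max_future=1)
print(mask.int())  # row t shows which frames frame t may attend to
```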
+
+
+
+
+ + ☆ VAE Explainer: Supplement Learning Variational Autoencoders with + Interactive Visualization + + +
+ Variational Autoencoders are widespread in Machine Learning, but are +typically explained with dense math notation or static code examples. This +paper presents VAE Explainer, an interactive Variational Autoencoder running in +the browser to supplement existing static documentation (e.g., Keras Code +Examples). VAE Explainer adds interactions to the VAE summary with interactive +model inputs, latent space, and output. VAE Explainer connects the high-level +understanding with the implementation: annotated code and a live computational +graph. The VAE Explainer interactive visualization is live at +https://xnought.github.io/vae-explainer and the code is open source at +https://github.com/xnought/vae-explainer. + +
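For readers who want the implementation-level view that such annotated examples walk through, a minimal VAE with the standard reparameterization trick looks roughly like the following; this is a generic sketch, not the tool's actual code.

```python
import torch
import torch.nn as nn

class TinyVAE(nn.Module):
    """Minimal VAE: encoder -> (mu, log_var) -> reparameterize -> decoder."""

    def __init__(self, in_dim=784, latent_dim=2):
        super().__init__()
        self.encoder = nn.Linear(in_dim, 2 * latent_dim)   # outputs mu and log_var
        self.decoder = nn.Linear(latent_dim, in_dim)

    def forward(self, x):
        mu, log_var = self.encoder(x).chunk(2, dim=-1)
        eps = torch.randn_like(mu)                 # sample standard Gaussian noise
        z = mu + torch.exp(0.5 * log_var) * eps    # reparameterization trick
        return self.decoder(z), mu, log_var

x = torch.rand(4, 784)
recon, mu, log_var = TinyVAE()(x)
print(recon.shape, mu.shape)
```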
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ SGFormer: Single-Layer Graph Transformers with Approximation-Free Linear + Complexity NeurIPS2023 + + +
+ Learning representations on large graphs is a long-standing challenge due to +the interdependent nature of graph data. Transformers recently have shown promising +performance on small graphs thanks to their global attention for capturing +all-pair interactions beyond observed structures. Existing approaches tend to +inherit the spirit of Transformers in language and vision tasks, and embrace +complicated architectures by stacking deep attention-based propagation layers. +In this paper, we attempt to evaluate the necessity of adopting multi-layer +attentions in Transformers on graphs, which considerably restricts efficiency. +Specifically, we analyze a generic hybrid propagation layer, +comprised of all-pair attention and graph-based propagation, and show that +multi-layer propagation can be reduced to one-layer propagation, with the same +capability for representation learning. It suggests a new technical path for +building powerful and efficient Transformers on graphs, particularly through +simplifying model architectures without sacrificing expressiveness. As +exemplified by this work, we propose the Simplified Single-layer Graph +Transformer (SGFormer), whose main component is a single-layer global +attention that scales linearly w.r.t. graph size and requires no +approximation for accommodating all-pair interactions. Empirically, SGFormer +successfully scales to the web-scale graph ogbn-papers100M, yielding +orders-of-magnitude inference acceleration over peer Transformers on +medium-sized graphs, and demonstrates competitiveness with limited labeled +data. + +
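Global attention with linear complexity in the number of nodes can be sketched via a kernelized attention formulation that avoids forming the N x N attention matrix; the snippet below is a generic linear-attention illustration and not SGFormer's exact propagation rule.

```python
import torch

def linear_global_attention(x: torch.Tensor, wq, wk, wv) -> torch.Tensor:
    """All-pair attention computed in O(N) by reordering the matrix products.

    x: node features of shape (N, d). Instead of forming the N x N attention
    matrix, we first aggregate keys and values into a (d, d) summary, then
    apply the queries to that summary.
    """
    q, k, v = x @ wq, x @ wk, x @ wv
    q, k = torch.relu(q) + 1e-6, torch.relu(k) + 1e-6   # simple positive feature map
    kv = k.T @ v                                        # (d, d) summary over all nodes
    normalizer = q @ k.sum(dim=0, keepdim=True).T       # (N, 1)
    return (q @ kv) / normalizer

n, d = 1000, 64
x = torch.randn(n, d)
w = [torch.randn(d, d) * d**-0.5 for _ in range(3)]
out = linear_global_attention(x, *w)
print(out.shape)  # (1000, 64), computed without an N x N attention matrix
```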
+
+ comment: Extended version of NeurIPS2023 contribution arXiv:2306.10759 +
+
+
+
+
+ + ☆ Model-independent variable selection via the rule-based variable priority + + +
+ While achieving high prediction accuracy is a fundamental goal in machine +learning, an equally important task is finding a small number of features with +high explanatory power. One popular selection technique is permutation +importance, which assesses a variable's impact by measuring the change in +prediction error after permuting the variable. However, this can be problematic +due to the need to create artificial data, a problem shared by other methods as +well. Another problem is that variable selection methods can be limited by +being model-specific. We introduce a new model-independent approach, Variable +Priority (VarPro), which works by utilizing rules without the need to generate +artificial data or evaluate prediction error. The method is relatively easy to +use, requiring only the calculation of sample averages of simple statistics, +and can be applied to many data settings, including regression, classification, +and survival. We investigate the asymptotic properties of VarPro and show, +among other things, that VarPro has a consistent filtering property for noise +variables. Empirical studies using synthetic and real-world data show the +method achieves a balanced performance and compares favorably to many +state-of-the-art procedures currently used for variable selection. + +
+
+
+
+
+ + ☆ Biomimetic Frontend for Differentiable Audio Processing + + +
+ While models in audio and speech processing are becoming deeper and more +end-to-end, they as a consequence need expensive training on large data, and +are often brittle. We build on a classical model of human hearing and make it +differentiable, so that we can combine traditional explainable biomimetic +signal processing approaches with deep-learning frameworks. This allows us to +arrive at an expressive and explainable model that is easily trained on modest +amounts of data. We apply this model to audio processing tasks, including +classification and enhancement. Results show that our differentiable model +surpasses black-box approaches in terms of computational efficiency and +robustness, even with little training data. We also discuss other potential +applications. + +
+
+
+
+
+ + ☆ Clean Label Attacks against SLU Systems + + +
+ Poisoning backdoor attacks involve an adversary manipulating the training +data to induce certain behaviors in the victim model by inserting a trigger in +the signal at inference time. We adapted clean label backdoor (CLBD)-data +poisoning attacks, which do not modify the training labels, on state-of-the-art +speech recognition models that support/perform a Spoken Language Understanding +task, achieving 99.8% attack success rate by poisoning 10% of the training +data. We analyzed how varying the signal-strength of the poison, percent of +samples poisoned, and choice of trigger impact the attack. We also found that +CLBD attacks are most successful when applied to training samples that are +inherently hard for a proxy model. Using this strategy, we achieved an attack +success rate of 99.3% by poisoning a meager 1.5% of the training data. Finally, +we applied two previously developed defenses against gradient-based attacks, +and found that they attain mixed success against poisoning. + +
+
+ comment: Accepted at IEEE SLT 2024 +
+
+
+
+
+ + ☆ Predicting Trust In Autonomous Vehicles: Modeling Young Adult + Psychosocial Traits, Risk-Benefit Attitudes, And Driving Factors With Machine + Learning + + +
+ Low trust remains a significant barrier to Autonomous Vehicle (AV) adoption. +To design trustworthy AVs, we need to better understand the individual traits, +attitudes, and experiences that impact people's trust judgements. We use +machine learning to understand the most important factors that contribute to +young adult trust based on a comprehensive set of personal factors gathered via +survey (n = 1457). Factors ranged from psychosocial and cognitive attributes to +driving style, experiences, and perceived AV risks and benefits. Using the +explainable AI technique SHAP, we found that perceptions of AV risks and +benefits, attitudes toward feasibility and usability, institutional trust, +prior experience, and a person's mental model are the most important +predictors. Surprisingly, psychosocial and many technology- and +driving-specific factors were not strong predictors. Results highlight the +importance of individual differences for designing trustworthy AVs for diverse +groups and lead to key implications for future design and research. + +
+
+ comment: 31 pages (including references and appendix), 7 figures, 7 tables +
+
+
+
+
+ + ☆ PINNfluence: Influence Functions for Physics-Informed Neural Networks + + +
+ Recently, physics-informed neural networks (PINNs) have emerged as a flexible +and promising application of deep learning to partial differential equations in +the physical sciences. While offering strong performance and competitive +inference speeds on forward and inverse problems, their black-box nature limits +interpretability, particularly regarding alignment with expected physical +behavior. In the present work, we explore the application of influence +functions (IFs) to validate and debug PINNs post-hoc. Specifically, we apply +variations of IF-based indicators to gauge the influence of different types of +collocation points on the prediction of PINNs applied to a 2D Navier-Stokes +fluid flow problem. Our results demonstrate how IFs can be adapted to PINNs to +reveal the potential for further studies. + +
+
+
+
+
+ + ☆ A Bayesian Approach to Clustering via the Proper Bayesian Bootstrap: the + Bayesian Bagged Clustering (BBC) algorithm + + +
+ The paper presents a novel approach for unsupervised techniques in the field +of clustering. A new method is proposed to enhance existing literature models +using the proper Bayesian bootstrap to improve results in terms of robustness +and interpretability. Our approach is organized in two steps: k-means +clustering is used for prior elicitation, then proper Bayesian bootstrap is +applied as a resampling method in an ensemble clustering approach. Results are +analyzed introducing measures of uncertainty based on Shannon entropy. The +proposal provides a clear indication of the optimal number of clusters, as well +as a better representation of the clustered data. Empirical results are +provided on simulated data showing the methodological and empirical advances +obtained. + +
+
+
+
+
+ + ☆ DELTA: Dual Consistency Delving with Topological Uncertainty for Active + Graph Domain Adaptation + + +
+ Graph domain adaptation has recently enabled knowledge transfer across +different graphs. However, without the semantic information on target graphs, +the performance on target graphs is still far from satisfactory. To address the +issue, we study the problem of active graph domain adaptation, which selects a +small number of informative nodes on the target graph for extra +annotation. This problem is highly challenging due to the complicated +topological relationships and the distribution discrepancy across graphs. In +this paper, we propose a novel approach named Dual Consistency Delving with +Topological Uncertainty (DELTA) for active graph domain adaptation. Our DELTA +consists of an edge-oriented graph subnetwork and a path-oriented graph +subnetwork, which can explore topological semantics from complementary +perspectives. In particular, our edge-oriented graph subnetwork utilizes the +message passing mechanism to learn neighborhood information, while our +path-oriented graph subnetwork explores high-order relationships from +substructures. To jointly learn from the two subnetworks, we roughly select +informative candidate nodes considering consistency across the two +subnetworks. Then, we aggregate local semantics from each candidate node's K-hop subgraph based +on node degrees for topological uncertainty estimation. To overcome potential +distribution shifts, we compare target nodes and their corresponding source +nodes for discrepancy scores as an additional component for fine selection. +Extensive experiments on benchmark datasets demonstrate that DELTA outperforms +various state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Average-Reward Maximum Entropy Reinforcement Learning for Underactuated + Double Pendulum Tasks + + +
+ This report presents a solution for the swing-up and stabilisation tasks of +the acrobot and the pendubot, developed for the AI Olympics competition at IROS +2024. Our approach employs the Average-Reward Entropy Advantage Policy +Optimization (AR-EAPO), a model-free reinforcement learning (RL) algorithm that +combines average-reward RL and maximum entropy RL. Results demonstrate that our +controller achieves improved performance and robustness scores compared to +established baseline methods in both the acrobot and pendubot scenarios, +without the need for a heavily engineered reward function or system model. The +current results are applicable exclusively to the simulation stage setup. + +
+
+
+
+
+ + ☆ Optimization and Generalization Guarantees for Weight Normalization + + +
+ Weight normalization (WeightNorm) is widely used in practice for the training +of deep neural networks and modern deep learning libraries have built-in +implementations of it. In this paper, we provide the first theoretical +characterizations of both optimization and generalization of deep WeightNorm +models with smooth activation functions. For optimization, from the form of the +Hessian of the loss, we note that a small Hessian of the predictor leads to a +tractable analysis. Thus, we bound the spectral norm of the Hessian of +WeightNorm networks and show its dependence on the network width and weight +normalization terms--the latter being unique to WeightNorm networks. +Then, we use this bound to establish training convergence guarantees under +suitable assumptions for gradient descent. For generalization, we use WeightNorm +to get a uniform convergence based generalization bound, which is independent +of the width and depends sublinearly on the depth. Finally, we present +experimental results which illustrate how the normalization terms and other +quantities of theoretical interest relate to the training of WeightNorm +networks. + +
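Weight normalization reparameterizes each weight vector as w = g * v / ||v||, separating its direction from its scale; the sketch below shows this standard reparameterization for a linear layer (a generic illustration, not the paper's experimental code).

```python
import torch
import torch.nn as nn

class WeightNormLinear(nn.Module):
    """Linear layer with the WeightNorm reparameterization w = g * v / ||v||."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.v = nn.Parameter(torch.randn(out_features, in_features))
        self.g = nn.Parameter(torch.ones(out_features))          # per-row learned scale
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x):
        # Normalize each output row of v, then rescale by the learned gain g.
        w = self.g[:, None] * self.v / self.v.norm(dim=1, keepdim=True)
        return x @ w.T + self.bias

layer = WeightNormLinear(16, 8)
print(layer(torch.randn(4, 16)).shape)  # torch.Size([4, 8])
```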
+
+
+
+
+ + ☆ Multi forests: Variable importance for multi-class outcomes + + +
+ In prediction tasks with multi-class outcomes, identifying covariates +specifically associated with one or more outcome classes can be important. +Conventional variable importance measures (VIMs) from random forests (RFs), +like permutation and Gini importance, focus on overall predictive performance +or node purity, without differentiating between the classes. Therefore, they +can be expected to fail to distinguish class-associated covariates from +covariates that only distinguish between groups of classes. We introduce a VIM +called multi-class VIM, tailored for identifying exclusively class-associated +covariates, via a novel RF variant called multi forests (MuFs). The trees in +MuFs use both multi-way and binary splitting. The multi-way splits generate +child nodes for each class, using a split criterion that evaluates how well +these nodes represent their respective classes. This setup forms the basis of +the multi-class VIM, which measures the discriminatory ability of the splits +performed in the respective covariates with regard to this split criterion. +Alongside the multi-class VIM, we introduce a second VIM, the discriminatory +VIM. This measure, based on the binary splits, assesses the strength of the +general influence of the covariates, irrespective of their +class-associatedness. Simulation studies demonstrate that the multi-class VIM +specifically ranks class-associated covariates highly, unlike conventional VIMs +which also rank other types of covariates highly. Analyses of 121 datasets +reveal that MuFs often have slightly lower predictive performance compared to +conventional RFs. This is, however, not a limiting factor given the algorithm's +primary purpose of calculating the multi-class VIM. + +
+
+ comment: 30 pages, 6 figures +
+
+
+
+
+ + ☆ XSub: Explanation-Driven Adversarial Attack against Blackbox Classifiers + via Feature Substitution + + +
+ Despite its significant benefits in enhancing the transparency and +trustworthiness of artificial intelligence (AI) systems, explainable AI (XAI) +has yet to reach its full potential in real-world applications. One key +challenge is that XAI can unintentionally provide adversaries with insights +into black-box models, inevitably increasing their vulnerability to various +attacks. In this paper, we develop a novel explanation-driven adversarial +attack against black-box classifiers based on feature substitution, called +XSub. The key idea of XSub is to strategically replace important features +(identified via XAI) in the original sample with corresponding important +features from a "golden sample" of a different label, thereby increasing the +likelihood of the model misclassifying the perturbed sample. The degree of +feature substitution is adjustable, allowing us to control how much of the +original sample's information is replaced. This flexibility effectively balances +a trade-off between the attack's effectiveness and its stealthiness. XSub is +also highly cost-effective in that the number of required queries to the +prediction model and the explanation model in conducting the attack is in O(1). +In addition, XSub can be easily extended to launch backdoor attacks in case the +attacker has access to the model's training data. Our evaluation demonstrates +that XSub is not only effective and stealthy but also cost-effective, enabling +its application across a wide range of AI models. + +
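The core substitution step can be sketched as replacing the top-k most important features (as ranked by any explainer) with the corresponding values from a golden sample of another class; the helper below is a hypothetical illustration of that idea, not the authors' code.

```python
import numpy as np

def xsub_perturb(sample: np.ndarray, golden: np.ndarray,
                 importance: np.ndarray, k: int) -> np.ndarray:
    """Replace the k most important features of `sample` with `golden`'s values.

    `importance` holds per-feature importance scores for `sample`, e.g. from
    an XAI method; larger scores mean more influential features.
    """
    perturbed = sample.copy()
    top_k = np.argsort(importance)[::-1][:k]   # indices of the k most important features
    perturbed[top_k] = golden[top_k]           # substitute with golden-sample values
    return perturbed

sample = np.array([0.2, 0.9, 0.1, 0.7])
golden = np.array([0.8, 0.1, 0.6, 0.3])       # a sample from a different label
importance = np.array([0.05, 0.60, 0.10, 0.25])
print(xsub_perturb(sample, golden, importance, k=2))  # features 1 and 3 replaced
```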
+
+
+
+
+ + ☆ Latent Space Score-based Diffusion Model for Probabilistic Multivariate + Time Series Imputation + + +
+ Accurate imputation is essential for the reliability and success of +downstream tasks. Recently, diffusion models have attracted great attention in +this field. However, these models neglect the latent distribution in a +lower-dimensional space derived from the observed data, which limits the +generative capacity of the diffusion model. Additionally, dealing with the +original missing data without labels becomes particularly problematic. To +address these issues, we propose the Latent Space Score-Based Diffusion Model +(LSSDM) for probabilistic multivariate time series imputation. Observed values +are projected onto a low-dimensional latent space and coarse values of the +missing data are reconstructed without knowing their ground truth values by +this unsupervised learning approach. Finally, the reconstructed values are fed +into a conditional diffusion model to obtain the precise imputed values of the +time series. In this way, LSSDM not only possesses the power to identify the +latent distribution but also seamlessly integrates the diffusion model to +obtain the high-fidelity imputed values and assess the uncertainty of the +dataset. Experimental results demonstrate that LSSDM achieves superior +imputation performance while also providing a better explanation and +uncertainty analysis of the imputation mechanism. The code is available at +https://github.com/gorgen2020/LSSDM_imputation. + +
+
+ comment: 5 pages, conference +
+
+
+
+
+ + ☆ HLTCOE JHU Submission to the Voice Privacy Challenge 2024 + + +
+ We present a number of systems for the Voice Privacy Challenge, including +voice conversion based systems such as the kNN-VC method and the WavLM voice +conversion method, and text-to-speech (TTS) based systems including +Whisper-VITS. We found that while voice conversion systems better preserve +emotional content, they struggle to conceal speaker identity in semi-white-box +attack scenarios; conversely, TTS methods perform better at anonymization and +worse at emotion preservation. Finally, we propose a random admixture system +which seeks to balance out the strengths and weaknesses of the two categories of +systems, achieving a strong EER of over 40% while maintaining UAR at a +respectable 47%. + +
+
+ comment: Submission to the Voice Privacy Challenge 2024. Accepted and + presented at +
+
+
+
+
+ + ☆ AnyBipe: An End-to-End Framework for Training and Deploying Bipedal + Robots Guided by Large Language Models + + +
+ Training and deploying reinforcement learning (RL) policies for robots, +especially in accomplishing specific tasks, presents substantial challenges. +Recent advancements have explored diverse reward function designs, training +techniques, simulation-to-reality (sim-to-real) transfers, and performance +analysis methodologies, yet these still require significant human intervention. +This paper introduces an end-to-end framework for training and deploying RL +policies, guided by Large Language Models (LLMs), and evaluates its +effectiveness on bipedal robots. The framework consists of three interconnected +modules: an LLM-guided reward function design module, an RL training module +leveraging prior work, and a sim-to-real homomorphic evaluation module. This +design significantly reduces the need for human input by utilizing only +essential simulation and deployment platforms, with the option to incorporate +human-engineered strategies and historical data. We detail the construction of +these modules, their advantages over traditional approaches, and demonstrate +the framework's capability to autonomously develop and refine controlling +strategies for bipedal robot locomotion, showcasing its potential to operate +independently of human intervention. + +
+
+
+
+
+ + ☆ Detect Fake with Fake: Leveraging Synthetic Data-driven Representation + for Synthetic Image Detection ECCV 2024 + + +
+ Are general-purpose visual representations acquired solely from synthetic +data useful for detecting fake images? In this work, we show the effectiveness +of synthetic data-driven representations for synthetic image detection. Upon +analysis, we find that vision transformers trained by the latest visual +representation learners with synthetic data can effectively distinguish fake +from real images without seeing any real images during pre-training. Notably, +using SynCLR as the backbone in a state-of-the-art detection method +demonstrates a performance improvement of +10.32 mAP and +4.73% accuracy over +the widely used CLIP, when tested on previously unseen GAN models. Code is +available at https://github.com/cvpaperchallenge/detect-fake-with-fake. + +
+
+ comment: Accepted to TWYN workshop at ECCV 2024 +
+
+
+
+
+ + ☆ Exploring Graph Structure Comprehension Ability of Multimodal Large + Language Models: Case Studies + + +
+ Large Language Models (LLMs) have shown remarkable capabilities in processing +various data structures, including graphs. While previous research has focused +on developing textual encoding methods for graph representation, the emergence +of multimodal LLMs presents a new frontier for graph comprehension. These +advanced models, capable of processing both text and images, offer potential +improvements in graph understanding by incorporating visual representations +alongside traditional textual data. This study investigates the impact of graph +visualisations on LLM performance across a range of benchmark tasks at node, +edge, and graph levels. Our experiments compare the effectiveness of multimodal +approaches against purely textual graph representations. The results provide +valuable insights into both the potential and limitations of leveraging visual +graph modalities to enhance LLMs' graph structure comprehension abilities. + +
+
+
+
+
+ + ☆ Adjoint Matching: Fine-tuning Flow and Diffusion Generative Models with + Memoryless Stochastic Optimal Control + + +
+ Dynamical generative models that produce samples through an iterative +process, such as Flow Matching and denoising diffusion models, have seen +widespread use, but there have not been many theoretically sound methods for +improving these models with reward fine-tuning. In this work, we cast reward +fine-tuning as stochastic optimal control (SOC). Critically, we prove that a +very specific memoryless noise schedule must be enforced during fine-tuning, in +order to account for the dependency between the noise variable and the +generated samples. We also propose a new algorithm named Adjoint Matching which +outperforms existing SOC algorithms, by casting SOC problems as regression +problems. We find that our approach significantly improves over existing methods +for reward fine-tuning, achieving better consistency, realism, and +generalization to unseen human preference reward models, while retaining sample +diversity. + +
+
+
+
+
+ + ☆ Kinect Calibration and Data Optimization For Anthropometric Parameters + + +
+ Recently, several 3D vision systems have been developed and are widely used in +various applications, including medical and biometric fields. The Microsoft +Kinect sensor has been the most widely used camera among 3D vision systems. The +Kinect sensor can obtain depth images of a scene and the 3D coordinates of human +joints. Thus, anthropometric features can be extracted easily. However, the raw +anthropometric features and 3D joint coordinates captured from the Kinect sensor +are unstable. The main reason is that the data vary with the distance between the +individual's joints and the location of the Kinect sensor. Consequently, using +these data without Kinect calibration and data optimization does not yield +sufficient and reliable results. In this study, we propose a novel method for +calibrating the Kinect sensor and optimizing skeleton features. Results indicate +that the proposed method is quite effective +and worthy of further study in more general scenarios. + +
+
+
+
+
+ + ☆ FP-VEC: Fingerprinting Large Language Models via Efficient Vector + Addition + + +
+ Training Large Language Models (LLMs) requires immense computational power +and vast amounts of data. As a result, protecting the intellectual property of +these models through fingerprinting is essential for ownership authentication. +While adding fingerprints to LLMs through fine-tuning has been attempted, it +remains costly and unscalable. In this paper, we introduce FP-VEC, a pilot +study on using fingerprint vectors as an efficient fingerprinting method for +LLMs. Our approach generates a fingerprint vector that represents a +confidential signature embedded in the model, allowing the same fingerprint to +be seamlessly incorporated into an unlimited number of LLMs via vector +addition. Results on several LLMs show that FP-VEC is lightweight by running on +CPU-only devices for fingerprinting, scalable with a single training and +unlimited fingerprinting process, and preserves the model's normal behavior. +The project page is available at https://fingerprintvector.github.io . + +
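The fingerprint-vector idea resembles task-vector arithmetic on model weights: subtract the base model's weights from a fingerprint-fine-tuned copy and add the resulting delta to other models of the same architecture. The sketch below illustrates that arithmetic on toy state dicts; it is an assumption about the mechanism for illustration, not the authors' released code.

```python
import torch

def compute_fingerprint_vector(base_state: dict, fingerprinted_state: dict) -> dict:
    """Delta between a fingerprint-fine-tuned model and its base model."""
    return {k: fingerprinted_state[k] - base_state[k] for k in base_state}

def add_fingerprint(target_state: dict, fp_vector: dict) -> dict:
    """Stamp another model of the same architecture by adding the delta."""
    return {k: target_state[k] + fp_vector[k] for k in target_state}

# Toy example with tiny "models" represented as state dicts.
base = {"w": torch.zeros(2, 2)}
fingerprinted = {"w": torch.full((2, 2), 0.01)}
other_model = {"w": torch.ones(2, 2)}

fp_vec = compute_fingerprint_vector(base, fingerprinted)
print(add_fingerprint(other_model, fp_vec)["w"])
```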
+
+
+
+
+ + ☆ RF Challenge: The Data-Driven Radio Frequency Signal Separation + Challenge + + +
+ This paper addresses the critical problem of interference rejection in +radio-frequency (RF) signals using a novel, data-driven approach that leverages +state-of-the-art AI models. Traditionally, interference rejection algorithms +are manually tailored to specific types of interference. This work introduces a +more scalable data-driven solution and contains the following contributions. +First, we present an insightful signal model that serves as a foundation for +developing and analyzing interference rejection algorithms. Second, we +introduce the RF Challenge, a publicly available dataset featuring diverse RF +signals along with code templates, which facilitates data-driven analysis of RF +signal problems. Third, we propose novel AI-based rejection algorithms, +specifically architectures like UNet and WaveNet, and evaluate their +performance across eight different signal mixture types. These models +demonstrate superior performance exceeding traditional methods like matched +filtering and linear minimum mean square error estimation by up to two orders +of magnitude in bit-error rate. Fourth, we summarize the results from an open +competition hosted at 2024 IEEE International Conference on Acoustics, Speech, +and Signal Processing (ICASSP 2024) based on the RF Challenge, highlighting the +significant potential for continued advancements in this area. Our findings +underscore the promise of deep learning algorithms in mitigating interference, +offering a strong foundation for future research. + +
+
+ comment: 14 pages, 12 figures, submitted to the IEEE Open Journal of the + Communications Society +
+
+
+
+
+ + ☆ Can Kans (re)discover predictive models for Direct-Drive Laser Fusion? + + +
+ The domain of laser fusion presents a unique and challenging predictive +modeling application landscape for machine learning methods due to high problem +complexity and limited training data. Data-driven approaches utilizing +prescribed functional forms, inductive biases and physics-informed learning +(PIL) schemes have been successful in the past for achieving desired +generalization ability and model interpretation that aligns with physics +expectations. In complex multi-physics application domains, however, it is not +always obvious how architectural biases or discriminative penalties can be +formulated. In this work, focusing on nuclear fusion energy using high powered +lasers, we present the use of Kolmogorov-Arnold Networks (KANs) as an +alternative to PIL for developing a new type of data-driven predictive model +which is able to achieve high prediction accuracy and physics interpretability. +A KAN based model, a MLP with PIL, and a baseline MLP model are compared in +generalization ability and interpretation with a domain expert-derived symbolic +regression model. Through empirical studies in this high physics complexity +domain, we show that KANs can potentially provide benefits when developing +predictive models for data-starved physics applications. + +
+
+
+
+
+ + ☆ AutoIRT: Calibrating Item Response Theory Models with Automated Machine + Learning + + +
+ Item response theory (IRT) is a class of interpretable factor models that are +widely used in computerized adaptive tests (CATs), such as language proficiency +tests. Traditionally, these are fit using parametric mixed effects models on +the probability of a test taker getting the correct answer to a test item +(i.e., question). Neural net extensions of these models, such as BertIRT, +require specialized architectures and parameter tuning. We propose a multistage +fitting procedure that is compatible with out-of-the-box Automated Machine +Learning (AutoML) tools. It is based on a Monte Carlo EM (MCEM) outer loop with +a two stage inner loop, which trains a non-parametric AutoML grade model using +item features followed by an item specific parametric model. This greatly +accelerates the modeling workflow for scoring tests. We demonstrate its +effectiveness by applying it to the Duolingo English Test, a high stakes, +online English proficiency test. We show that the resulting model is typically +better calibrated, achieves better predictive performance, and produces more accurate +scores than existing methods (non-explanatory IRT models and explanatory IRT +models like BERT-IRT). Along the way, we provide a brief survey of machine +learning methods for calibration of item parameters for CATs. + +
+
+
+
+
+ + ☆ TabKANet: Tabular Data Modelling with Kolmogorov-Arnold Network and + Transformer + + +
+ Tabular data is the most common type of data in real-life scenarios. In this +study, we propose a method based on the TabKANet architecture, which utilizes +the Kolmogorov-Arnold network to encode numerical features and merge them with +categorical features, enabling unified modeling of tabular data on the +Transformer architecture. This model demonstrates outstanding performance in +six widely used binary classification tasks, suggesting that TabKANet has the +potential to become a standard approach for tabular modeling, surpassing +traditional neural networks. Furthermore, this research reveals the significant +advantages of the Kolmogorov-Arnold network in encoding numerical features. The +code of our work is available at https://github.com/tsinghuamedgao20/TabKANet. + +
+
+
+
+
+ + ☆ Electrocardiogram Report Generation and Question Answering via + Retrieval-Augmented Self-Supervised Modeling + + +
+ Interpreting electrocardiograms (ECGs) and generating comprehensive reports +remain challenging tasks in cardiology, often requiring specialized expertise +and significant time investment. To address these critical issues, we propose +ECG-ReGen, a retrieval-based approach for ECG-to-text report generation and +question answering. Our method leverages self-supervised learning for the ECG +encoder, enabling efficient similarity searches and report retrieval. By +combining pre-training with dynamic retrieval and Large Language Model +(LLM)-based refinement, ECG-ReGen effectively analyzes ECG data and answers +related queries, with the potential to improve patient care. Experiments +conducted on the PTB-XL and MIMIC-IV-ECG datasets demonstrate superior +performance in both in-domain and cross-domain scenarios for report generation. +Furthermore, our approach exhibits competitive performance on the ECG-QA dataset +compared to fully supervised methods when utilizing off-the-shelf LLMs for +zero-shot question answering. This approach, effectively combining a +self-supervised encoder and LLMs, offers a scalable and efficient solution for +accurate ECG interpretation, holding significant potential to enhance clinical +decision-making. + +
+
+
+
+
+ + ☆ Deep Learning-based Codes for Wiretap Fading Channels + + +
+ The wiretap channel is a well-studied problem in the physical layer security +(PLS) literature. Although it is proven that the decoding error probability and +information leakage can be made arbitrarily small in the asymptotic regime, +further research on finite-blocklength codes is required on the path towards +practical, secure communications systems. This work provides the first +experimental characterization of a deep learning-based, finite-blocklength code +construction for multi-tap fading wiretap channels without channel state +information (CSI). In addition to the evaluation of the average probability of +error and information leakage, we illustrate the influence of (i) the number of +fading taps, (ii) differing variances of the fading coefficients and (iii) the +seed selection for the hash function-based security layer. + +
+
+
+
+
+ + ☆ In-depth Analysis of Low-rank Matrix Factorisation in a Federated + Setting + + +
+ We analyze a distributed algorithm to compute a low-rank matrix factorization +on $N$ clients, each holding a local dataset $\mathbf{S}^i \in \mathbb{R}^{n_i +\times d}$; mathematically, we seek to solve $\min_{\mathbf{U}^i \in +\mathbb{R}^{n_i\times r}, \mathbf{V}\in \mathbb{R}^{d \times r} } \frac{1}{2} +\sum_{i=1}^N \|\mathbf{S}^i - \mathbf{U}^i \mathbf{V}^\top\|^2_{\text{F}}$. +Considering a power initialization of $\mathbf{V}$, we rewrite the previous +smooth non-convex problem into a smooth strongly-convex problem that we solve +using a parallel Nesterov gradient descent potentially requiring a single step +of communication at the initialization step. For any client $i$ in $\{1, \dots, +N\}$, we obtain a global $\mathbf{V}$ in $\mathbb{R}^{d \times r}$ common to +all clients and a local variable $\mathbf{U}^i$ in $\mathbb{R}^{n_i \times r}$. +We provide a linear rate of convergence of the excess loss which depends on +$\sigma_{\max} / \sigma_{r}$, where $\sigma_{r}$ is the $r^{\mathrm{th}}$ +singular value of the concatenation $\mathbf{S}$ of the matrices +$(\mathbf{S}^i)_{i=1}^N$. This result improves the rates of convergence given +in the literature, which depend on $\sigma_{\max}^2 / \sigma_{\min}^2$. We +provide an upper bound on the Frobenius-norm error of reconstruction under the +power initialization strategy. We complete our analysis with experiments on +both synthetic and real data. + +
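Given a shared factor V, each client's subproblem over its local factor is an ordinary least-squares fit; the minimal sketch below solves it in closed form per client (rather than with the Nesterov gradient descent analyzed in the paper, which is swapped out here for brevity).

```python
import numpy as np

def local_U_update(S_i: np.ndarray, V: np.ndarray) -> np.ndarray:
    """Best local factor U^i minimizing ||S^i - U^i V^T||_F^2 for fixed V."""
    # Closed-form least squares: U^i = S^i V (V^T V)^{-1}, computed independently per client.
    return S_i @ V @ np.linalg.inv(V.T @ V)

rng = np.random.default_rng(0)
d, r = 20, 3
V = rng.normal(size=(d, r))                    # shared factor, e.g. from a power initialization
clients = [rng.normal(size=(n_i, d)) for n_i in (50, 80)]
U_locals = [local_U_update(S_i, V) for S_i in clients]
print([U.shape for U in U_locals])             # [(50, 3), (80, 3)]
```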
+
+
+
+
+ + ☆ Increasing Both Batch Size and Learning Rate Accelerates Stochastic + Gradient Descent + + +
+ The performance of mini-batch stochastic gradient descent (SGD) strongly +depends on setting the batch size and learning rate to minimize the empirical +loss in training the deep neural network. In this paper, we present theoretical +analyses of mini-batch SGD with four schedulers: (i) constant batch size and +decaying learning rate scheduler, (ii) increasing batch size and decaying +learning rate scheduler, (iii) increasing batch size and increasing learning +rate scheduler, and (iv) increasing batch size and warm-up decaying learning +rate scheduler. We show that mini-batch SGD using scheduler (i) does not always +minimize the expectation of the full gradient norm of the empirical loss, +whereas it does using any of schedulers (ii), (iii), and (iv). Furthermore, +schedulers (iii) and (iv) accelerate mini-batch SGD. The paper also provides +numerical results supporting these analyses, showing that using scheduler (iii) or +(iv) minimizes the full gradient norm of the empirical loss faster than using +scheduler (i) or (ii). + +
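A minimal way to express the four schedulers is as functions of the epoch index; the growth, decay, and warm-up constants below are illustrative assumptions, not the values analyzed in the paper.

```python
def scheduler(kind: str, epoch: int, b0: int = 32, lr0: float = 0.1):
    """Return (batch_size, learning_rate) for the given epoch.

    (i)   constant batch size, decaying learning rate
    (ii)  increasing batch size, decaying learning rate
    (iii) increasing batch size, increasing learning rate
    (iv)  increasing batch size, warm-up then decaying learning rate
    """
    grow = 2 ** (epoch // 10)                      # double the batch every 10 epochs (illustrative)
    decay = lr0 / (1 + 0.1 * epoch)
    if kind == "i":
        return b0, decay
    if kind == "ii":
        return b0 * grow, decay
    if kind == "iii":
        return b0 * grow, lr0 * 1.05 ** epoch
    if kind == "iv":
        lr = lr0 * epoch / 5 if epoch < 5 else decay   # linear warm-up, then decay
        return b0 * grow, lr
    raise ValueError(kind)

for kind in ("i", "ii", "iii", "iv"):
    print(kind, scheduler(kind, epoch=20))
```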
+
+ comment: 23 pages, 5 figures +
+
+
+
+
+ + ☆ Measure-Theoretic Time-Delay Embedding + + +
+ The celebrated Takens' embedding theorem provides a theoretical foundation +for reconstructing the full state of a dynamical system from partial +observations. However, the classical theorem assumes that the underlying system +is deterministic and that observations are noise-free, limiting its +applicability in real-world scenarios. Motivated by these limitations, we +rigorously establish a measure-theoretic generalization that adopts an Eulerian +description of the dynamics and recasts the embedding as a pushforward map +between probability spaces. Our mathematical results leverage recent advances +in optimal transportation theory. Building on our novel measure-theoretic +time-delay embedding theory, we have developed a new computational framework +that forecasts the full state of a dynamical system from time-lagged partial +observations, engineered with better robustness to handle sparse and noisy +data. We showcase the efficacy and versatility of our approach through several +numerical examples, ranging from the classic Lorenz-63 system to large-scale, +real-world applications such as NOAA sea surface temperature forecasting and +ERA5 wind field reconstruction. + +
+
+ comment: 32 pages, 8 figures +
+
+
+
+
+ + ☆ SAUC: Sparsity-Aware Uncertainty Calibration for Spatiotemporal + Prediction with Graph Neural Networks SP + + +
+ Quantifying uncertainty is crucial for robust and reliable predictions. +However, existing spatiotemporal deep learning mostly focuses on deterministic +prediction, overlooking the inherent uncertainty in such prediction. +Particularly, highly-granular spatiotemporal datasets are often sparse, posing +extra challenges in prediction and uncertainty quantification. To address these +issues, this paper introduces a novel post-hoc Sparsity-aware Uncertainty +Calibration (SAUC) framework, which calibrates uncertainty in both zero and +non-zero values. To develop SAUC, we first modify the state-of-the-art +deterministic spatiotemporal Graph Neural Networks (ST-GNNs) to probabilistic +ones in the pre-calibration phase. Then we calibrate the probabilistic ST-GNNs +for zero and non-zero values using quantile approaches. Through extensive +experiments, we demonstrate that SAUC can effectively fit the variance of +sparse data and generalize across two real-world spatiotemporal datasets at +various granularities. Specifically, our empirical experiments show a 20% +reduction in calibration errors in zero entries on the sparse traffic accident +and urban crime prediction datasets. Overall, this work demonstrates the theoretical and +empirical values of the SAUC framework, thus bridging a significant gap between +uncertainty quantification and spatiotemporal prediction. + +
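The zero/non-zero split at the heart of the calibration step can be sketched as two separate quantile adjustments; the snippet below is a schematic illustration of that idea under stated assumptions (constant quantile predictions, multiplicative corrections), not the released SAUC code.

```python
import numpy as np

def sparsity_aware_calibrate(pred_quantiles: np.ndarray, y_true: np.ndarray, q: float = 0.9):
    """Scale predicted q-quantiles separately for zero and non-zero targets.

    pred_quantiles: the model's predicted q-quantile per observation.
    Returns one multiplicative correction factor per group so that the
    empirical coverage of the q-quantile matches q within each group.
    """
    factors = {}
    for name, mask in (("zero", y_true == 0), ("nonzero", y_true != 0)):
        ratio = y_true[mask] / np.maximum(pred_quantiles[mask], 1e-8)
        factors[name] = np.quantile(ratio, q)      # stretch or shrink the quantile per group
    return factors

rng = np.random.default_rng(1)
y = rng.poisson(0.3, size=1000).astype(float)      # sparse counts, mostly zeros
pred = np.full_like(y, 0.8)                        # a crude constant q-quantile prediction
print(sparsity_aware_calibrate(pred, y))
```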
+
+ comment: Paper accepted by ACM SIGSPATIAL 2024 +
+
+
+
+
+ + ☆ Energy Consumption Trends in Sound Event Detection Systems + + +
+ Deep learning systems have become increasingly energy- and +computation-intensive, raising concerns about their environmental impact. As +organizers of the Detection and Classification of Acoustic Scenes and Events +(DCASE) challenge, we recognize the importance of addressing this issue. For +the past three years, we have integrated energy consumption metrics into the +evaluation of sound event detection (SED) systems. In this paper, we analyze +the impact of this energy criterion on the challenge results and explore the +evolution of system complexity and energy consumption over the years. We +highlight a shift towards more energy-efficient approaches during training +without compromising performance, while the number of operations and system +complexity continue to grow. Through this analysis, we hope to promote more +environmentally friendly practices within the SED community. + +
+
+
+
+
+ + ☆ Online Network Inference from Graph-Stationary Signals with Hidden Nodes + + +
+ Graph learning is the fundamental task of estimating unknown graph +connectivity from available data. Typical approaches assume that not only is +all information available simultaneously but also that all nodes can be +observed. However, in many real-world scenarios, data can neither be known +completely nor obtained all at once. We present a novel method for online graph +estimation that accounts for the presence of hidden nodes. We consider signals +that are stationary on the underlying graph, which provides a model for the +unknown connections to hidden nodes. We then formulate a convex optimization +problem for graph learning from streaming, incomplete graph signals. We solve +the proposed problem through an efficient proximal gradient algorithm that can +run in real-time as data arrives sequentially. Additionally, we provide +theoretical conditions under which our online algorithm is similar to +batch-wise solutions. Through experimental results on synthetic and real-world +data, we demonstrate the viability of our approach for online graph learning in +the presence of missing observations. + +
+
+
+
+
+ + ☆ Uncertainty Estimation by Density Aware Evidential Deep Learning ICML 2024 + + +
+ Evidential deep learning (EDL) has shown remarkable success in uncertainty +estimation. However, there is still room for improvement, particularly in +out-of-distribution (OOD) detection and classification tasks. The limited OOD +detection performance of EDL arises from its inability to reflect the distance +between the testing example and training data when quantifying uncertainty, +while its limited classification performance stems from its parameterization of +the concentration parameters. To address these limitations, we propose a novel +method called Density Aware Evidential Deep Learning (DAEDL). DAEDL integrates +the feature space density of the testing example with the output of EDL during +the prediction stage, while using a novel parameterization that resolves the +issues in the conventional parameterization. We prove that DAEDL enjoys a +number of favorable theoretical properties. DAEDL demonstrates state-of-the-art +performance across diverse downstream tasks related to uncertainty estimation +and classification. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ☆ A Hybrid Meta-Learning and Multi-Armed Bandit Approach for + Context-Specific Multi-Objective Recommendation Optimization + + +
+ Recommender systems in online marketplaces face the challenge of balancing +multiple objectives to satisfy various stakeholders, including customers, +providers, and the platform itself. This paper introduces Juggler-MAB, a hybrid +approach that combines meta-learning with Multi-Armed Bandits (MAB) to address +the limitations of existing multi-stakeholder recommendation systems. Our +method extends the Juggler framework, which uses meta-learning to predict +optimal weights for utility and compensation adjustments, by incorporating a +MAB component for real-time, context-specific refinements. We present a +two-stage approach where Juggler provides initial weight predictions, followed +by MAB-based adjustments that adapt to rapid changes in user behavior and +market conditions. Our system leverages contextual features such as device type +and brand to make fine-grained weight adjustments based on specific segments. +To evaluate our approach, we developed a simulation framework using a dataset +of 0.6 million searches from Expedia's lodging booking platform. Results show +that Juggler-MAB outperforms the original Juggler model across all metrics, +with NDCG improvements of 2.9%, a 13.7% reduction in regret, and a 9.8% +improvement in best arm selection rate. + +
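The second-stage refinement can be sketched as a contextual epsilon-greedy bandit over a small set of weight adjustments keyed by segment (for example, device type and brand); the code below is a hypothetical illustration of that design, not the production system described in the paper.

```python
import random
from collections import defaultdict

class SegmentedEpsilonGreedy:
    """Per-segment epsilon-greedy bandit over discrete weight adjustments."""

    def __init__(self, adjustments, epsilon=0.1):
        self.adjustments = adjustments            # candidate deltas around the meta-learned weight
        self.epsilon = epsilon
        self.counts = defaultdict(lambda: [0] * len(adjustments))
        self.values = defaultdict(lambda: [0.0] * len(adjustments))

    def select(self, segment):
        if random.random() < self.epsilon:        # explore
            return random.randrange(len(self.adjustments))
        return max(range(len(self.adjustments)),  # exploit best-known arm for this segment
                   key=lambda a: self.values[segment][a])

    def update(self, segment, arm, reward):
        self.counts[segment][arm] += 1
        n = self.counts[segment][arm]
        self.values[segment][arm] += (reward - self.values[segment][arm]) / n

bandit = SegmentedEpsilonGreedy(adjustments=[-0.05, 0.0, 0.05])
segment = ("mobile", "brand_a")
arm = bandit.select(segment)
bandit.update(segment, arm, reward=1.0)           # e.g. a booking occurred
print(arm, bandit.values[segment])
```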
+
+
+
+
+ + ☆ Uncertainty and Generalizability in Foundation Models for Earth + Observation + + +
+ We take the perspective in which we want to design a downstream task (such as +estimating vegetation coverage) on a certain area of interest (AOI) with a +limited labeling budget. By leveraging an existing Foundation Model (FM) we +must decide whether we train a downstream model on a different but label-rich +AOI hoping it generalizes to our AOI, or we split labels in our AOI for +training and validating. In either case, we face choices concerning what FM to +use, how to sample our AOI for labeling, etc. which affect both the performance +and uncertainty of the results. In this work, we perform a large ablative study +using eight existing FMs on either Sentinel 1 or Sentinel 2 as input data, and +the classes from the ESA World Cover product as downstream tasks across eleven +AOIs. We do repeated sampling and training, resulting in an ablation of some +500K simple linear regression models. Our results show both the limits of +spatial generalizability across AOIs and the power of FMs where we are able to +get over 0.9 correlation coefficient between predictions and targets on +different chip level predictive tasks. And still, performance and uncertainty +vary greatly across AOIs, tasks and FMs. We believe this is a key issue in +practice, because there are many design decisions behind each FM and downstream +task (input modalities, sampling, architectures, pretraining, etc.) and usually +a downstream task designer is aware of and can decide upon a few of them. +Through this work, we advocate for the usage of the methodology herein +described (large ablations on reference global labels and simple probes), both +when publishing new FMs, and to make informed decisions when designing +downstream tasks to use them. + +
+
+ comment: A large ablation study measuring uncertainty and spatial + generalizability with 8 foundation models, 11 world regions and 7 downstream + tasks +
+
+
+
+
+ + ☆ Adaptive Sampling for Continuous Group Equivariant Neural Networks ICML 2024 + + +
+ Steerable networks, which process data with intrinsic symmetries, often use +Fourier-based nonlinearities that require sampling from the entire group, +leading to a need for discretization in continuous groups. As the number of +samples increases, both performance and equivariance improve, yet this also +leads to higher computational costs. To address this, we introduce an adaptive +sampling approach that dynamically adjusts the sampling process to the +symmetries in the data, reducing the number of required group samples and +lowering the computational demands. We explore various implementations and +their effects on model performance, equivariance, and computational efficiency. +Our findings demonstrate improved model performance, and a marginal increase in +memory efficiency. + +
+
+ comment: 9 pages, published in the Geometry-grounded Representation Learning + and Generative Modeling (GRaM) Workshop at ICML 2024 +
+
+
+
+
+ + ☆ Multi-intent Aware Contrastive Learning for Sequential Recommendation + + +
+ Intent is a significant latent factor influencing user-item interaction +sequences. Prevalent sequence recommendation models that utilize contrastive +learning predominantly rely on single-intent representations to direct the +training process. However, this paradigm oversimplifies real-world +recommendation scenarios, attempting to encapsulate the diversity of intents +within the single-intent level representation. Sequential recommendation (SR) models considering +multi-intent information in their framework are more likely to reflect +real-life recommendation scenarios accurately. + +
+
+
+
+
+ + ☆ Bridging Dynamic Factor Models and Neural Controlled Differential + Equations for Nowcasting GDP CIKM 2024 + + +
+ Gross domestic product (GDP) nowcasting is crucial for policy-making as GDP +growth is a key indicator of economic conditions. Dynamic factor models (DFMs) +have been widely adopted by government agencies for GDP nowcasting due to their +ability to handle irregular or missing macroeconomic indicators and their +interpretability. However, DFMs face two main challenges: i) the lack of +capturing economic uncertainties such as sudden recessions or booms, and ii) +the limitation of capturing irregular dynamics from mixed-frequency data. To +address these challenges, we introduce NCDENow, a novel GDP nowcasting +framework that integrates neural controlled differential equations (NCDEs) with +DFMs. This integration effectively handles the dynamics of irregular time +series. NCDENow consists of 3 main modules: i) factor extraction leveraging +DFM, ii) dynamic modeling using NCDE, and iii) GDP growth prediction through +regression. We evaluate NCDENow against 6 baselines on 2 real-world GDP +datasets from South Korea and the United Kingdom, demonstrating its enhanced +predictive capability. Our empirical results favor our method, highlighting the +significant potential of integrating NCDE into nowcasting models. Our code and +dataset are available at https://github.com/sklim84/NCDENow_CIKM2024. + +
+
+ comment: Accepted at CIKM 2024. Seonkyu Lim and Jeongwhan Choi are co-first + authors with equal contributions +
+
+
+
+
+ + ☆ Disentangling the sources of cyber risk premia + + +
+ We use a methodology based on a machine learning algorithm to quantify firms' +cyber risks based on their disclosures and a dedicated cyber corpus. The model +can identify paragraphs related to determined cyber-threat types and +accordingly attribute several related cyber scores to the firm. The cyber +scores are unrelated to other firms' characteristics. Stocks with high cyber +scores significantly outperform other stocks. The long-short cyber risk factors +have positive risk premia, are robust to all factors' benchmarks, and help +price returns. Furthermore, we suggest the market does not distinguish between +different types of cyber risks but instead views them as a single, aggregate +cyber risk. + +
+
+
+
+
+ + ☆ Quasimetric Value Functions with Dense Rewards + + +
+ As a generalization of reinforcement learning (RL) to parametrizable goals, +goal conditioned RL (GCRL) has a broad range of applications, particularly in +challenging tasks in robotics. Recent work has established that the optimal +value function of GCRL $Q^\ast(s,a,g)$ has a quasimetric structure, leading to +targeted neural architectures that respect such structure. However, the +relevant analyses assume a sparse reward setting -- a known aggravating factor +to sample complexity. We show that the key property underpinning a quasimetric, +viz., the triangle inequality, is preserved under a dense reward setting as +well. Contrary to earlier findings where dense rewards were shown to be +detrimental to GCRL, we identify the key condition necessary for the triangle +inequality to hold. Dense reward functions that satisfy this condition can only +improve, never worsen, sample complexity. This opens up opportunities to train +efficient neural architectures with dense rewards, compounding their benefits +to sample complexity. We evaluate this proposal in 12 standard benchmark +environments in GCRL featuring challenging continuous control tasks. Our +empirical results confirm that training a quasimetric value function in our +dense reward setting indeed outperforms training with sparse rewards. + +
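For readers who want the quasimetric property in concrete form, the toy sketch below (an illustration only, not the paper's code) builds an asymmetric shortest-path cost table and checks the triangle inequality d(s, g) <= d(s, w) + d(w, g) that a quasimetric must satisfy:

# Toy check of the quasimetric triangle inequality underpinning GCRL value
# functions; hypothetical example, not the paper's implementation.
import numpy as np

rng = np.random.default_rng(0)
n = 6
cost = rng.uniform(1.0, 5.0, size=(n, n))   # asymmetric directed edge costs
np.fill_diagonal(cost, 0.0)

d = cost.copy()                              # Floyd-Warshall all-pairs shortest paths
for k in range(n):
    d = np.minimum(d, d[:, [k]] + d[[k], :])

lhs = d[:, None, :]                          # d(s, g)
rhs = d[:, :, None] + d[None, :, :]          # d(s, w) + d(w, g)
assert np.all(lhs <= rhs + 1e-9)             # triangle inequality holds for all s, w, g
print("asymmetric:", not np.allclose(d, d.T), "- triangle inequality verified")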
+
+
+
+
+ + ☆ Layerwise Change of Knowledge in Neural Networks + + +
+ This paper aims to explain how a deep neural network (DNN) gradually extracts +new knowledge and forgets noisy features through layers in forward propagation. +Although the definition of knowledge encoded by a DNN has not yet +reached a consensus, previous studies have derived a series of mathematical +results that treat interactions as symbolic primitive inference patterns encoded +by a DNN. We extend the definition of interactions and, for the first time, +extract interactions encoded by intermediate layers. We quantify and track the +newly emerged interactions and the forgotten interactions in each layer during +the forward propagation, which sheds new light on the learning behavior of DNNs. +The layer-wise change of interactions also reveals the change of the +generalization capacity and instability of feature representations of a DNN. + +
+
+
+
+
+ + ☆ L3Cube-IndicQuest: A Benchmark Question Answering Dataset for Evaluating + Knowledge of LLMs in Indic Context + + +
+ Large Language Models (LLMs) have made significant progress in incorporating +Indic languages within multilingual models. However, it is crucial to +quantitatively assess whether these languages perform comparably to globally +dominant ones, such as English. Currently, there is a lack of benchmark +datasets specifically designed to evaluate the regional knowledge of LLMs in +various Indic languages. In this paper, we present the L3Cube-IndicQuest, a +gold-standard question-answering benchmark dataset designed to evaluate how +well multilingual LLMs capture regional knowledge across various Indic +languages. The dataset contains 200 question-answer pairs, each for English and +19 Indic languages, covering five domains specific to the Indic region. We aim +for this dataset to serve as a benchmark, providing ground truth for evaluating +the performance of LLMs in understanding and representing knowledge relevant to +the Indian context. The IndicQuest can be used for both reference-based +evaluation and LLM-as-a-judge evaluation. The dataset is shared publicly at +https://github.com/l3cube-pune/indic-nlp . + +
+
+
+
+
+ + ☆ Personalized Weight Loss Management through Wearable Devices and + Artificial Intelligence + + +
+ Early detection of chronic and Non-Communicable Diseases (NCDs) is crucial +for effective treatment during the initial stages. This study explores the +application of wearable devices and Artificial Intelligence (AI) in order to +predict weight loss changes in overweight and obese individuals. Using wearable +data from a 1-month trial involving around 100 subjects from the AI4FoodDB +database, including biomarkers, vital signs, and behavioral data, we identify +key differences between those achieving weight loss (>= 2% of their initial +weight) and those who do not. Feature selection techniques and classification +algorithms reveal promising results, with the Gradient Boosting classifier +achieving 84.44% Area Under the Curve (AUC). The integration of multiple data +sources (e.g., vital signs, physical and sleep activity, etc.) enhances +performance, suggesting the potential of wearable devices and AI in +personalized healthcare. + +
+
+ comment: 15 pages, 5 figures, 6 tables, 1 appendix +
+
+
+
+
+ + ☆ Precision Aquaculture: An Integrated Computer Vision and IoT Approach + for Optimized Tilapia Feeding + + +
+ Traditional fish farming practices often lead to inefficient feeding, +resulting in environmental issues and reduced productivity. We developed an +innovative system combining computer vision and IoT technologies for precise +Tilapia feeding. Our solution uses real-time IoT sensors to monitor water +quality parameters and computer vision algorithms to analyze fish size and +count, determining optimal feed amounts. A mobile app enables remote monitoring +and control. We utilized YOLOv8 for keypoint detection to measure Tilapia +weight from length, achieving 94% precision on 3,500 annotated +images. Pixel-based measurements were converted to centimeters using depth +estimation for accurate feeding calculations. Our method, with data collection +mirroring inference conditions, significantly improved results. Preliminary +estimates suggest this approach could increase production up to 58 times +compared to traditional farms. Our models, code, and dataset are +open-source and available upon reasonable request. + +
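As a rough illustration of the measurement chain the abstract describes (pixels to centimeters via depth, then length to weight), the snippet below uses a pinhole-camera scaling and the generic fisheries power law W = a * L^b; the focal length and the coefficients a and b are placeholder values, not the paper's calibration:

# Hypothetical pixel-to-length-to-weight conversion; all constants are illustrative.
def pixel_length_to_cm(pixel_len: float, depth_m: float, focal_px: float = 1400.0) -> float:
    """Pinhole-camera scaling: real size grows linearly with distance to the camera."""
    return (pixel_len * depth_m / focal_px) * 100.0   # metres -> centimetres

def length_to_weight_g(length_cm: float, a: float = 0.015, b: float = 3.0) -> float:
    """Generic length-weight relation W = a * L**b (a and b are species-specific)."""
    return a * length_cm ** b

length_cm = pixel_length_to_cm(pixel_len=220, depth_m=0.8)
print(f"estimated length {length_cm:.1f} cm, weight {length_to_weight_g(length_cm):.0f} g")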
+
+ comment: 8 pages, 6 figures, 3 tables, 21st International Conference on + Informatics in Control, Automation, and Robotics +
+
+
+
+
+ + ☆ xTED: Cross-Domain Policy Adaptation via Diffusion-Based Trajectory + Editing + + +
+ Reusing pre-collected data from different domains is an attractive solution +in decision-making tasks where the accessible data is insufficient in the +target domain but relatively abundant in other related domains. Existing +cross-domain policy transfer methods mostly aim at learning domain +correspondences or corrections to facilitate policy learning, which requires +learning domain/task-specific model components, representations, or policies +that are inflexible or not fully reusable to accommodate arbitrary domains and +tasks. These issues make us wonder: can we directly bridge the domain gap at +the data (trajectory) level, instead of devising complicated, domain-specific +policy transfer models? In this study, we propose a Cross-Domain Trajectory +EDiting (xTED) framework with a new diffusion transformer model (Decision +Diffusion Transformer, DDiT) that captures the trajectory distribution from the +target dataset as a prior. The proposed diffusion transformer backbone captures +the intricate dependencies among state, action, and reward sequences, as well +as the transition dynamics within the target data trajectories. With the above +pre-trained diffusion prior, source data trajectories with domain gaps can be +transformed into edited trajectories that closely resemble the target data +distribution through the diffusion-based editing process, which implicitly +corrects the underlying domain gaps, enhancing the state realism and dynamics +reliability in source trajectory data, while enabling flexible choices of +downstream policy learning methods. Despite its simplicity, xTED demonstrates +superior performance against other baselines in extensive simulation and +real-robot experiments. + +
+
+ comment: xTED offers a novel, generic, flexible, simple and effective paradigm + that casts cross-domain policy adaptation as a data pre-processing problem +
+
+
+
+
+ + ☆ Redesigning graph filter-based GNNs to relax the homophily assumption + + +
+ Graph neural networks (GNNs) have become a workhorse approach for learning +from data defined over irregular domains, typically by implicitly assuming that +the data structure is represented by a homophilic graph. However, recent works +have revealed that many relevant applications involve heterophilic data where +the performance of GNNs can be notably compromised. To address this challenge, +we present a simple yet effective architecture designed to mitigate the +limitations of the homophily assumption. The proposed architecture reinterprets +the role of graph filters in convolutional GNNs, resulting in a more general +architecture while incorporating a stronger inductive bias than GNNs based on +filter banks. The proposed convolutional layer enhances the expressive capacity +of the architecture, enabling it to learn from both homophilic and heterophilic +data and preventing the issue of oversmoothing. From a theoretical standpoint, +we show that the proposed architecture is permutation equivariant. Finally, we +show that the proposed GNN compares favorably with several +state-of-the-art baselines on both homophilic and heterophilic datasets, +showcasing its promising potential. + +
+
+
+
+
+ + ☆ Acoustic identification of individual animals with hierarchical + contrastive learning ICASSP 2025 + + +
+ Acoustic identification of individual animals (AIID) is closely related to +audio-based species classification but requires a finer level of detail to +distinguish between individual animals within the same species. In this work, +we frame AIID as a hierarchical multi-label classification task and propose the +use of hierarchy-aware loss functions to learn robust representations of +individual identities that maintain the hierarchical relationships among +species and taxa. Our results demonstrate that hierarchical embeddings not only +enhance identification accuracy at the individual level but also at higher +taxonomic levels, effectively preserving the hierarchical structure in the +learned representations. By comparing our approach with non-hierarchical +models, we highlight the advantage of enforcing this structure in the embedding +space. Additionally, we extend the evaluation to the classification of novel +individual classes, demonstrating the potential of our method in open-set +classification scenarios. + +
+
+ comment: Under review; Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Towards certifiable AI in aviation: landscape, challenges, and + opportunities + + +
+ Artificial Intelligence (AI) methods are powerful tools for various domains, +including critical fields such as avionics, where certification is required to +achieve and maintain an acceptable level of safety. General solutions for +safety-critical systems must address three main questions: Is it suitable? What +drives the system's decisions? Is it robust to errors/attacks? This is more +complex in AI than in traditional methods. In this context, this paper presents +a comprehensive mind map of formal AI certification in avionics. It highlights +the challenges of certifying AI development with an example to emphasize the +need for qualification beyond performance metrics. + +
+
+
+
+
+ + ☆ Investigating Disentanglement in a Phoneme-level Speech Codec for + Prosody Modeling + + +
+ Most of the prevalent approaches in speech prosody modeling rely on learning +global style representations in a continuous latent space which encode and +transfer the attributes of reference speech. However, recent work on neural +codecs which are based on Residual Vector Quantization (RVQ) already shows +great potential, offering distinct advantages. We investigate the prosody +modeling capabilities of the discrete space of such an RVQ-VAE model, modifying +it to operate at the phoneme level. We condition both the encoder and decoder +of the model on linguistic representations and apply a global speaker embedding +in order to factor out both phonetic and speaker information. We conduct an +extensive set of investigations based on subjective experiments and objective +measures to show that the phoneme-level discrete latent representations +obtained this way achieve a high degree of disentanglement, capturing +fine-grained prosodic information that is robust and transferable. The latent +space turns out to have interpretable structure with its principal components +corresponding to pitch and energy. + +
+
+
+
+
+ + ☆ Online Learning Of Expanding Graphs + + +
+ This paper addresses the problem of online network topology inference for +expanding graphs from a stream of spatiotemporal signals. Online algorithms for +dynamic graph learning are crucial in delay-sensitive applications or when +changes in topology occur rapidly. While existing works focus on inferring the +connectivity within a fixed set of nodes, in practice, the graph can grow as +new nodes join the network. This poses additional challenges like modeling +temporal dynamics involving signals and graphs of different sizes. This growth +also increases the computational complexity of the learning process, which may +become prohibitive. To the best of our knowledge, this is the first work to +tackle this setting. We propose a general online algorithm based on projected +proximal gradient descent that accounts for the increasing graph size at each +iteration. Recursively updating the sample covariance matrix is a key aspect of +our approach. We introduce a strategy that enables different types of updates +for nodes that just joined the network and for previously existing nodes. To +provide further insights into the proposed method, we specialize it in Gaussian +Markov random field settings, where we analyze the computational complexity and +characterize the dynamic cumulative regret. Finally, we demonstrate the +effectiveness of the proposed approach using both controlled experiments and +real-world datasets from epidemic and financial networks. + +
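The recursive covariance update highlighted above can be sketched as follows; this is a simplified illustration (a rank-one running update when a new signal arrives and zero-padding when a node joins), not the authors' estimator:

# Illustrative recursive covariance updates for a growing node set.
import numpy as np

def update_with_signal(C, t, x):
    """Running (uncentred) sample covariance after observing signal x at time t."""
    return (t * C + np.outer(x, x)) / (t + 1)

def expand_for_new_node(C):
    """Add a zero row/column for a newly joined node; later signals fill it in."""
    n = C.shape[0]
    C_new = np.zeros((n + 1, n + 1))
    C_new[:n, :n] = C
    return C_new

rng = np.random.default_rng(1)
C, t = np.zeros((3, 3)), 0
for _ in range(50):                    # stream of signals over the initial 3 nodes
    C, t = update_with_signal(C, t, rng.normal(size=3)), t + 1
C = expand_for_new_node(C)             # a fourth node joins the network
for _ in range(50):                    # stream continues over all 4 nodes
    C, t = update_with_signal(C, t, rng.normal(size=4)), t + 1
print("covariance size:", C.shape)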
+
+
+
+
+ + ☆ Promoting Fairness in Link Prediction with Graph Enhancement + + +
+ Link prediction is a crucial task in network analysis, but it has been shown +to be prone to biased predictions, particularly when links are unfairly +predicted between nodes from different sensitive groups. In this paper, we +study the fair link prediction problem, which aims to ensure that the predicted +link probability is independent of the sensitive attributes of the connected +nodes. Existing methods typically incorporate debiasing techniques within graph +embeddings to mitigate this issue. However, training on large real-world graphs +is already challenging, and adding fairness constraints can further complicate +the process. To overcome this challenge, we propose FairLink, a method that +learns a fairness-enhanced graph to bypass the need for debiasing during the +link predictor's training. FairLink maintains link prediction accuracy by +ensuring that the enhanced graph follows a training trajectory similar to that +of the original input graph. Meanwhile, it enhances fairness by minimizing the +absolute difference in link probabilities between node pairs within the same +sensitive group and those between node pairs from different sensitive groups. +Our extensive experiments on multiple large-scale graphs demonstrate that +FairLink not only promotes fairness but also often achieves link prediction +accuracy comparable to baseline methods. Most importantly, the enhanced graph +exhibits strong generalizability across different GNN architectures. + +
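The fairness term sketched in the abstract, the gap between average predicted link probabilities for intra-group and inter-group node pairs, can be written in a few lines; the snippet below uses random stand-in predictions and is illustrative only:

# Illustrative group-fairness penalty for link prediction.
import numpy as np

def fairness_penalty(link_prob, groups, pairs):
    """Absolute gap between mean link probability of intra- and inter-group pairs."""
    same = groups[pairs[:, 0]] == groups[pairs[:, 1]]
    return abs(link_prob[same].mean() - link_prob[~same].mean())

rng = np.random.default_rng(0)
groups = rng.integers(0, 2, size=100)          # sensitive attribute per node
pairs = rng.integers(0, 100, size=(500, 2))    # candidate node pairs
link_prob = rng.uniform(size=500)              # stand-in for model outputs
print("fairness gap:", round(float(fairness_penalty(link_prob, groups, pairs)), 3))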
+
+
+
+
+ + ☆ LMAC-TD: Producing Time Domain Explanations for Audio Classifiers + + +
+ Neural networks are typically black-boxes that remain opaque with regards to +their decision mechanisms. Several works in the literature have proposed +post-hoc explanation methods to alleviate this issue. This paper proposes +LMAC-TD, a post-hoc explanation method that trains a decoder to produce +explanations directly in the time domain. This methodology builds upon the +foundation of L-MAC, Listenable Maps for Audio Classifiers, a method that +produces faithful and listenable explanations. We incorporate SepFormer, a +popular transformer-based time-domain source separation architecture. We show +through a user study that LMAC-TD significantly improves the audio quality of +the produced explanations while not sacrificing from faithfulness. + +
+
+ comment: The first two authors contributed equally to this research. Author + order is alphabetical +
+
+
+
+
+ + ☆ Training Gradient Boosted Decision Trees on Tabular Data Containing + Label Noise for Classification Tasks + + +
+ Label noise refers to the phenomenon where instances in a data set are +assigned to the wrong label. Label noise is harmful to classifier performance, +increases model complexity and impairs feature selection. Addressing label +noise is crucial, yet current research primarily focuses on image and text data +using deep neural networks. This leaves a gap in the study of tabular data and +gradient-boosted decision trees (GBDTs), the leading algorithm for tabular +data. Different methods have already been developed which either try to filter +label noise, model label noise while simultaneously training a classifier or +use learning algorithms which remain effective even if label noise is present. +This study aims to further investigate the effects of label noise on +gradient-boosted decision trees and methods to mitigate those effects. Through +comprehensive experiments and analysis, the implemented methods demonstrate +state-of-the-art noise detection performance on the Adult dataset and achieve +the highest classification precision and recall on the Adult and Breast Cancer +datasets, respectively. In summary, this paper enhances the understanding of +the impact of label noise on GBDTs and lays the groundwork for future research +in noise detection and correction methods. + +
+
+
+
+
+ + ☆ CPL: Critical Planning Step Learning Boosts LLM Generalization in + Reasoning Tasks + + +
+ Post-training large language models (LLMs) to develop reasoning capabilities +has proven effective across diverse domains, such as mathematical reasoning and +code generation. However, existing methods primarily focus on improving +task-specific reasoning but have not adequately addressed the model's +generalization capabilities across a broader range of reasoning tasks. To +tackle this challenge, we introduce Critical Planning Step Learning (CPL), +which leverages Monte Carlo Tree Search (MCTS) to explore diverse planning +steps in multi-step reasoning tasks. Based on long-term outcomes, CPL learns +step-level planning preferences to improve the model's planning capabilities +and, consequently, its general reasoning capabilities. Furthermore, while +effective in many scenarios for aligning LLMs, existing preference learning +approaches like Direct Preference Optimization (DPO) struggle with complex +multi-step reasoning tasks due to their inability to capture fine-grained +supervision at each step. We propose Step-level Advantage Preference +Optimization (Step-APO), which integrates an advantage estimate for step-level +preference pairs obtained via MCTS into the DPO. This enables the model to more +effectively learn critical intermediate planning steps, thereby further +improving its generalization in reasoning tasks. Experimental results +demonstrate that our method, trained exclusively on GSM8K and MATH, not only +significantly improves performance on GSM8K (+10.5%) and MATH (+6.5%), but also +enhances out-of-domain reasoning benchmarks, such as ARC-C (+4.0%), BBH +(+1.8%), MMLU-STEM (+2.2%), and MMLU (+0.9%). + +
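One plausible reading of the step-level objective, weighting each step-level DPO pair by its MCTS advantage gap, is sketched below; this is a hedged illustration and the paper's exact Step-APO formulation may differ:

# Hedged sketch of an advantage-weighted, step-level DPO-style loss.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def step_apo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, adv_w, adv_l, beta=0.1):
    """logp_*: policy log-probs of preferred/dispreferred planning steps;
    ref_logp_*: reference-model log-probs; adv_*: MCTS advantage estimates."""
    margin = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
    dpo = -np.log(sigmoid(margin))          # standard DPO term for this pair
    return (adv_w - adv_l) * dpo            # weight the pair by its advantage gap

# Toy numbers for two candidate planning steps at one node of the search tree.
print(step_apo_loss(logp_w=-1.2, logp_l=-1.5, ref_logp_w=-1.3,
                    ref_logp_l=-1.4, adv_w=0.8, adv_l=0.1))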
+
+
+
+
+ + ☆ Byzantine-Robust and Communication-Efficient Distributed Learning via + Compressed Momentum Filtering + + +
+ Distributed learning has become the standard approach for training +large-scale machine learning models across private data silos. While +distributed learning enhances privacy preservation and training efficiency, it +faces critical challenges related to Byzantine robustness and communication +reduction. Existing Byzantine-robust and communication-efficient methods rely +on full gradient information either at every iteration or at certain iterations +with a probability, and they only converge to an unnecessarily large +neighborhood around the solution. Motivated by these issues, we propose a novel +Byzantine-robust and communication-efficient stochastic distributed learning +method that imposes no requirements on batch size and converges to a smaller +neighborhood around the optimal solution than all existing methods, aligning +with the theoretical lower bound. Our key innovation is leveraging Polyak +Momentum to mitigate the noise caused by both biased compressors and stochastic +gradients, thus defending against Byzantine workers under information +compression. We provide proof of tight complexity bounds for our algorithm in +the context of non-convex smooth loss functions, demonstrating that these +bounds match the lower bounds in Byzantine-free scenarios. Finally, we validate +the practical significance of our algorithm through an extensive series of +experiments, benchmarking its performance on both binary classification and +image classification tasks. + +
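A schematic of the training loop described above, with top-k compression of local Polyak momentum and a coordinate-wise median standing in for the paper's momentum filtering rule (the attack, compressor, and aggregator here are illustrative assumptions):

# Schematic Byzantine-robust distributed step with compressed Polyak momentum.
import numpy as np

def top_k(v, k):
    """Keep only the k largest-magnitude coordinates (a simple biased compressor)."""
    out = np.zeros_like(v)
    idx = np.argsort(np.abs(v))[-k:]
    out[idx] = v[idx]
    return out

rng = np.random.default_rng(0)
d, n_workers, n_byz, alpha, lr = 20, 10, 2, 0.9, 0.1
x_opt = rng.normal(size=d)                      # minimiser of f(x) = ||x - x_opt||^2 / 2
x, momenta = np.zeros(d), np.zeros((n_workers, d))
print("initial distance:", round(float(np.linalg.norm(x - x_opt)), 3))

for _ in range(300):
    msgs = []
    for w in range(n_workers):
        grad = (x - x_opt) + rng.normal(scale=0.5, size=d)    # stochastic gradient
        momenta[w] = alpha * momenta[w] + (1 - alpha) * grad  # Polyak momentum
        msg = top_k(momenta[w], k=10)                         # compress before sending
        if w < n_byz:
            msg = -10.0 * msg                                 # Byzantine workers attack
        msgs.append(msg)
    x -= lr * np.median(np.stack(msgs), axis=0)               # robust aggregation
print("final distance  :", round(float(np.linalg.norm(x - x_opt)), 3))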
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Utilizing Data Fingerprints for Privacy-Preserving Algorithm Selection + in Time Series Classification: Performance and Uncertainty Estimation on + Unseen Datasets + + +
+ The selection of algorithms is a crucial step in designing AI services for +real-world time series classification use cases. Traditional methods such as +neural architecture search, automated machine learning, combined algorithm +selection, and hyperparameter optimizations are effective but require +considerable computational resources and necessitate access to all data points +to run their optimizations. In this work, we introduce a novel data fingerprint +that describes any time series classification dataset in a privacy-preserving +manner and provides insight into the algorithm selection problem without +requiring training on the (unseen) dataset. By decomposing the multi-target +regression problem, only our data fingerprints are used to estimate algorithm +performance and uncertainty in a scalable and adaptable manner. Our approach is +evaluated on the 112 University of California, Riverside (UCR) benchmark datasets, +demonstrating its effectiveness in predicting the performance of 35 +state-of-the-art algorithms and providing valuable insights for effective +algorithm selection in time series classification service systems, improving a +naive baseline by 7.32% on average in estimating the mean performance and +15.81% in estimating the uncertainty. + +
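A toy version of the fingerprint idea, with made-up summary statistics and simulated benchmark results standing in for the paper's fingerprint features and algorithm library:

# Toy illustration: summarise each dataset with a few privacy-preserving statistics
# and regress per-algorithm accuracy on them. Features and data are hypothetical.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

rng = np.random.default_rng(0)

def fingerprint(X, y):
    """A few aggregate statistics only -- no raw time series leaves the owner."""
    return np.array([X.shape[0], X.shape[1], len(np.unique(y)),
                     X.mean(), X.std(), np.abs(np.diff(X, axis=1)).mean()])

# Simulated library: 40 'seen' datasets with known accuracies of 3 algorithms.
fps, accs = [], []
for _ in range(40):
    n, T = rng.integers(50, 300), rng.integers(30, 120)
    X = rng.normal(size=(n, T))
    y = rng.integers(0, rng.integers(2, 6), size=n)
    fps.append(fingerprint(X, y))
    accs.append(rng.uniform(0.5, 0.95, size=3))     # stand-in benchmark results

model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=0))
model.fit(np.array(fps), np.array(accs))

X_new = rng.normal(size=(80, 60))                    # unseen dataset, never shared
y_new = rng.integers(0, 3, size=80)
print("predicted per-algorithm accuracy:", model.predict([fingerprint(X_new, y_new)])[0])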
+
+ comment: Hawaii International Conference on System Sciences (HICSS-58) 2025 +
+
+
+
+
+ + ☆ Improving Analog Neural Network Robustness: A Noise-Agnostic Approach + with Explainable Regularizations + + +
+ This work tackles the critical challenge of mitigating "hardware noise" in +deep analog neural networks, a major obstacle in advancing analog signal +processing devices. We propose a comprehensive, hardware-agnostic solution to +address both correlated and uncorrelated noise affecting the activation layers +of deep neural models. The novelty of our approach lies in its ability to +demystify the "black box" nature of noise-resilient networks by revealing the +underlying mechanisms that reduce sensitivity to noise. In doing so, we +introduce a new explainable regularization framework that harnesses these +mechanisms to significantly enhance noise robustness in deep neural +architectures. + +
+
+
+
+
+ + ☆ Co-Optimization of Robot Design and Control: Enhancing Performance and + Understanding Design Complexity + + +
+ The design (shape) of a robot is usually decided before the control is +implemented. This might limit how well the design is adapted to a task, as the +suitability of the design is given by how well the robot performs in the task, +which requires both a design and a controller. The co-optimization or +simultaneous optimization of the design and control of robots addresses this +limitation by producing a design and control that are both adapted to the task. +In this paper, we investigate some of the challenges inherent in the +co-optimization of design and control. We show that retraining the controller +of a robot with additional resources after the co-optimization process +terminates significantly improves the robot's performance. In addition, we +demonstrate that the resources allocated to training the controller for each +design influence the design complexity, where simpler designs are associated +with lower training budgets. The experimentation is conducted in four publicly +available simulation environments for co-optimization of design and control, +making the findings more applicable to the general case. We hope the results presented +in this paper will guide other practitioners in the co-optimization of +design and control of robots. + +
+
+
+
+
+ + ☆ Optimizing Item-based Marketing Promotion Efficiency in C2C Marketplace + with Dynamic Sequential Coupon Allocation Framework + + +
+ In e-commerce platforms, coupons play a crucial role in boosting +transactions. In the customer-to-customer (C2C) marketplace, ensuring the +satisfaction of both buyers and sellers is essential. While buyer-focused +marketing strategies often receive more attention, addressing the needs of +sellers is equally important. Additionally, the existing strategies tend to +optimize each promotion independently, resulting in a lack of continuity +between promotions and unnecessary costs in the pursuit of short-term impact +within each promotion period. + We introduce a Dynamic Sequential Coupon Allocation Framework (DSCAF) to +optimize item coupon allocation strategies across a series of promotions. DSCAF +provides sequential recommendations for coupon configurations and timing to +target items. In cases where initial suggestions do not lead to sales, it +dynamically adjusts the strategy and offers subsequent solutions. It integrates +two predictors for estimating the sale propensity in the current and subsequent +rounds of coupon allocation, and a decision-making process to determine the +coupon allocation solution. It runs iteratively until the item is sold. The +goal of the framework is to maximize Return on Investment (ROI) while ensuring that +the lift in Sell-through Rate (STR) remains above a specified threshold. DSCAF aims to +optimize sequential coupon efficiency with a long-term perspective rather than +solely focusing on the lift achieved in each individual promotion. It has been +applied to item coupon allocation at Mercari. + +
+
+
+
+
+ + ☆ Automatic Generation of Fast and Accurate Performance Models for Deep + Neural Network Accelerators + + +
+ Implementing Deep Neural Networks (DNNs) on resource-constrained edge devices +is a challenging task that requires tailored hardware accelerator architectures +and a clear understanding of their performance characteristics when executing +the intended AI workload. To facilitate this, we present an automated +generation approach for fast performance models to accurately estimate the +latency of a DNN mapped onto systematically modeled and concisely described +accelerator architectures. Using our accelerator architecture description +method, we modeled representative DNN accelerators such as Gemmini, UltraTrail, +Plasticine-derived, and a parameterizable systolic array. Together with DNN +mappings for those modeled architectures, we perform a combined DNN/hardware +dependency graph analysis, which enables us, in the best case, to evaluate only +154 loop kernel iterations to estimate the performance for 4.19 billion +instructions, achieving a significant speedup. We outperform regression and +analytical models in terms of mean absolute percentage error (MAPE) compared to +simulation results, while being several orders of magnitude faster than an RTL +simulation. + +
+
+ comment: Accepted version for: ACM Transactions on Embedded Computing Systems +
+
+
+
+
+ + ☆ CompressedMediQ: Hybrid Quantum Machine Learning Pipeline for + High-Dimensional Neuroimaging Data + + +
+ This paper introduces CompressedMediQ, a novel hybrid quantum-classical +machine learning pipeline specifically developed to address the computational +challenges associated with high-dimensional multi-class neuroimaging data +analysis. Standard neuroimaging datasets, such as 4D MRI data from the +Alzheimer's Disease Neuroimaging Initiative (ADNI) and Neuroimaging in +Frontotemporal Dementia (NIFD), present significant hurdles due to their vast +size and complexity. CompressedMediQ integrates classical high-performance +computing (HPC) nodes for advanced MRI pre-processing and Convolutional Neural +Network (CNN)-PCA-based feature extraction and reduction, addressing the +limited-qubit availability for quantum data encoding in the NISQ (Noisy +Intermediate-Scale Quantum) era. This is followed by Quantum Support Vector +Machine (QSVM) classification. By utilizing quantum kernel methods, the +pipeline optimizes feature mapping and classification, enhancing data +separability and outperforming traditional neuroimaging analysis techniques. +Experimental results highlight the pipeline's superior accuracy in dementia +staging, validating the practical use of quantum machine learning in clinical +diagnostics. Despite the limitations of NISQ devices, this proof-of-concept +demonstrates the transformative potential of quantum-enhanced learning, paving +the way for scalable and precise diagnostic tools in healthcare and signal +processing. + +
+
+
+
+
+ + ☆ Learning Short Codes for Fading Channels with No or Receiver-Only + Channel State Information + + +
+ In next-generation wireless networks, low latency often necessitates +short-length codewords that either do not use channel state information (CSI) +or rely solely on CSI at the receiver (CSIR). Gaussian codes that achieve +capacity for AWGN channels may be unsuitable for these no-CSI and CSIR-only +cases. In this work, we design short-length codewords for these cases using an +autoencoder architecture. From the designed codes, we observe the following: In +the no-CSI case, the learned codes are mutually orthogonal when the +distribution of the real and imaginary parts of the fading random variable has +support over the entire real line. However, when the support is limited to the +non-negative real line, the codes are not mutually orthogonal. For the +CSIR-only case, deep learning-based codes designed for AWGN channels perform +worse in fading channels with optimal coherent detection compared to codes +specifically designed for fading channels with CSIR, where the autoencoder +jointly learns encoding, coherent combining, and decoding. In both no-CSI and +CSIR-only cases, the codes perform at least as well as or better than classical +codes of the same block length. + +
+
+
+
+
+ + ☆ Molecular Graph Representation Learning via Structural Similarity + Information + + +
+ Graph Neural Networks (GNNs) have been widely employed for feature +representation learning in molecular graphs. Therefore, it is crucial to +enhance the expressiveness of feature representation to ensure the +effectiveness of GNNs. However, a significant portion of current research +primarily focuses on the structural features within individual molecules, often +overlooking the structural similarity between molecules, which is a crucial +aspect encapsulating rich information on the relationship between molecular +properties and structural characteristics. Thus, these approaches fail to +capture the rich semantic information at the molecular structure level. To +bridge this gap, we introduce the \textbf{Molecular Structural Similarity Motif +GNN (MSSM-GNN)}, a novel molecular graph representation learning method that +can capture structural similarity information among molecules from a global +perspective. In particular, we propose a specially designed graph that +leverages graph kernel algorithms to represent the similarity between molecules +quantitatively. Subsequently, we employ GNNs to learn feature representations +from molecular graphs, aiming to enhance the accuracy of property prediction by +incorporating additional molecular representation information. Finally, through +a series of experiments conducted on both small-scale and large-scale molecular +datasets, we demonstrate that our model consistently outperforms eleven +state-of-the-art baselines. The codes are available at +https://github.com/yaoyao-yaoyao-cell/MSSM-GNN. + +
+
+
+
+
+ + ☆ Batch Ensemble for Variance Dependent Regret in Stochastic Bandits + + +
+ Efficiently trading off exploration and exploitation is one of the key +challenges in online Reinforcement Learning (RL). Most works achieve this by +carefully estimating the model uncertainty and following the so-called +optimistic model. Inspired by practical ensemble methods, in this work we +propose a simple and novel batch ensemble scheme that provably achieves +near-optimal regret for stochastic Multi-Armed Bandits (MAB). Crucially, our +algorithm has just a single parameter, namely the number of batches, and its +value does not depend on distributional properties such as the scale and +variance of the losses. We complement our theoretical results by demonstrating +the effectiveness of our algorithm on synthetic benchmarks. + +
+
+
+
+
+ + ☆ Second-order difference subspace + + +
+ Subspace representation is a fundamental technique in various fields of +machine learning. Analyzing a geometrical relationship among multiple subspaces +is essential for understanding subspace series' temporal and/or spatial +dynamics. This paper proposes the second-order difference subspace, a +higher-order extension of the first-order difference subspace between two +subspaces that can analyze the geometrical difference between them. As a +preliminary for that, we extend the definition of the first-order difference +subspace to the more general setting that two subspaces with different +dimensions have an intersection. We then define the second-order difference +subspace by combining the concept of first-order difference subspace and +principal component subspace (Karcher mean) between two subspaces, motivated by +the second-order central difference method. We can understand that the +first/second-order difference subspaces correspond to the velocity and +acceleration of subspace dynamics from the viewpoint of a geodesic on a +Grassmann manifold. We demonstrate the validity and naturalness of our +second-order difference subspace by showing numerical results on two +applications: temporal shape analysis of a 3D object and time series analysis +of a biometric signal. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Fair CoVariance Neural Networks + + +
+ Covariance-based data processing is widespread across signal processing and +machine learning applications due to its ability to model data +interconnectivities and dependencies. However, harmful biases in the data may +become encoded in the sample covariance matrix and cause data-driven methods to +treat different subpopulations unfairly. Existing works such as fair principal +component analysis (PCA) mitigate these effects, but remain unstable in low +sample regimes, which in turn may jeopardize the fairness goal. To address both +biases and instability, we propose Fair coVariance Neural Networks (FVNNs), +which perform graph convolutions on the covariance matrix for both fair and +accurate predictions. Our FVNNs provide a flexible model compatible with +several existing bias mitigation techniques. In particular, FVNNs allow for +mitigating the bias in two ways: first, they operate on fair covariance +estimates that remove biases from their principal components; second, they are +trained in an end-to-end fashion via a fairness regularizer in the loss +function so that the model parameters are tailored to solve the task directly +in a fair manner. We prove that FVNNs are intrinsically fairer than analogous +PCA approaches thanks to their stability in low sample regimes. We validate the +robustness and fairness of our model on synthetic and real-world data, +showcasing the flexibility of FVNNs along with the tradeoff between fair and +accurate performance. + +
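For intuition, a covariance (graph) filter plus a per-group error-gap regulariser of the kind described above can be sketched as follows; the filter taps, readout, and penalty are illustrative placeholders rather than the FVNN architecture itself:

# Minimal sketch of a covariance-filter layer with a group-fairness regulariser.
import numpy as np

def covariance_filter(C, x, h):
    """Polynomial graph filter on the covariance matrix: sum_k h[k] * C^k x."""
    out, Ck_x = np.zeros_like(x), x.copy()
    for hk in h:
        out += hk * Ck_x
        Ck_x = C @ Ck_x
    return out

rng = np.random.default_rng(0)
n_samples, n_feat = 200, 8
X = rng.normal(size=(n_samples, n_feat))
y = X @ rng.normal(size=n_feat) + 0.1 * rng.normal(size=n_samples)
group = rng.integers(0, 2, size=n_samples)            # sensitive attribute

C = np.cov(X, rowvar=False)                           # sample covariance (the "graph")
h = np.array([1.0, 0.5, 0.25])                        # filter taps (learned in practice)
readout = 0.1 * rng.normal(size=n_feat)               # linear readout (learned in practice)

Z = np.stack([covariance_filter(C, x, h) for x in X])
pred = np.tanh(Z) @ readout                           # one-layer VNN-style prediction

task_loss = np.mean((pred - y) ** 2)
fair_reg = abs(np.mean((pred - y)[group == 0] ** 2) -
               np.mean((pred - y)[group == 1] ** 2))  # penalise per-group error gap
print("task loss:", round(float(task_loss), 3), "fairness term:", round(float(fair_reg), 3))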
+
+
+
+
+ + ☆ Think Twice Before You Act: Improving Inverse Problem Solving With MCMC + + +
+ Recent studies demonstrate that diffusion models can serve as a strong prior +for solving inverse problems. A prominent example is Diffusion Posterior +Sampling (DPS), which approximates the posterior distribution of data given the +measurement using Tweedie's formula. Despite the merits of being versatile in +solving various inverse problems without re-training, the performance of DPS is +hindered by the fact that this posterior approximation can be inaccurate, +especially at high noise levels. Therefore, we propose Diffusion +Posterior MCMC (DPMC), a novel inference algorithm +based on Annealed MCMC to solve inverse problems with pretrained diffusion +models. We define a series of intermediate distributions inspired by the +approximated conditional distributions used by DPS. Through annealed MCMC +sampling, we encourage the samples to follow each intermediate distribution +more closely before moving to the next distribution at a lower noise level, and +therefore reduce the accumulated error along the path. We test our algorithm in +various inverse problems, including super resolution, Gaussian deblurring, +motion deblurring, inpainting, and phase retrieval. Our algorithm outperforms +DPS with fewer evaluations across nearly all tasks, and is competitive +among existing approaches. + +
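The core loop, a few MCMC moves at each noise level before annealing to the next, can be illustrated on a toy scalar problem where the score is available in closed form; the analytic Gaussian score below stands in for a pretrained diffusion model and is not the paper's implementation:

# Toy annealed-MCMC loop in the spirit of DPMC on a 1-D inverse problem y = x + noise.
import numpy as np

rng = np.random.default_rng(0)
y_obs, noise_obs = 2.0, 0.5

def prior_score(x, sigma):
    """Score of a N(0, 1) prior convolved with noise level sigma (analytic stand-in
    for a pretrained diffusion model)."""
    return -x / (1.0 + sigma ** 2)

def likelihood_grad(x):
    return (y_obs - x) / noise_obs ** 2           # gradient of log p(y | x)

x, step = rng.normal(), 0.01                      # small fixed step for simplicity
for sigma in np.geomspace(5.0, 0.05, num=20):     # anneal from high to low noise
    for _ in range(10):                           # inner Langevin-type moves
        drift = prior_score(x, sigma) + likelihood_grad(x)
        x = x + step * drift + np.sqrt(2 * step) * rng.normal()
print("posterior sample:", round(float(x), 3),
      "| analytic posterior mean:", round(y_obs / (1 + noise_obs ** 2), 3))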
+
+
+
+
+ + ☆ Causal GNNs: A GNN-Driven Instrumental Variable Approach for Causal + Inference in Networks + + +
+ As network data applications continue to expand, causal inference within +networks has garnered increasing attention. However, hidden confounders +complicate the estimation of causal effects. Most methods rely on the strong +ignorability assumption, which presumes the absence of hidden confounders, an +assumption that is both difficult to validate and often unrealistic in +practice. To address this issue, we propose CgNN, a novel approach that +leverages network structure as instrumental variables (IVs), combined with +graph neural networks (GNNs) and attention mechanisms, to mitigate hidden +confounder bias and improve causal effect estimation. By utilizing network +structure as IVs, we reduce confounder bias while preserving the correlation +with treatment. Our integration of attention mechanisms enhances robustness and +improves the identification of important nodes. Our results on two real-world +datasets demonstrate that CgNN effectively mitigates hidden +confounder bias and offers a robust GNN-driven IV framework for causal +inference in complex network data. + +
+
+
+
+
+ + ☆ An Efficient Privacy-aware Split Learning Framework for Satellite + Communications + + +
+ In the rapidly evolving domain of satellite communications, integrating +advanced machine learning techniques, particularly split learning, is crucial +for enhancing data processing and model training efficiency across satellites, +space stations, and ground stations. Traditional ML approaches often face +significant challenges within satellite networks due to constraints such as +limited bandwidth and computational resources. To address this gap, we propose +a novel framework for more efficient SL in satellite communications. Our +approach, Dynamic Topology Informed Pruning, namely DTIP, combines differential +privacy with graph and model pruning to optimize graph neural networks for +distributed learning. DTIP strategically applies differential privacy to raw +graph data and prunes GNNs, thereby optimizing both model size and +communication load across network tiers. Extensive experiments across diverse +datasets demonstrate DTIP's efficacy in enhancing privacy, accuracy, and +computational efficiency. Specifically, on Amazon2M dataset, DTIP maintains an +accuracy of 0.82 while achieving a 50% reduction in floating-point operations +per second. Similarly, on ArXiv dataset, DTIP achieves an accuracy of 0.85 +under comparable conditions. Our framework not only significantly improves the +operational efficiency of satellite communications but also establishes a new +benchmark in privacy-aware distributed learning, potentially revolutionizing +data handling in space-based networks. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Integration of Mamba and Transformer -- MAT for Long-Short Range Time + Series Forecasting with Application to Weather Dynamics CEC + + +
+ Long-short range time series forecasting is essential for predicting future +trends and patterns over extended periods. While deep learning models such as +Transformers have made significant strides in advancing time series +forecasting, they often encounter difficulties in capturing long-term +dependencies and effectively managing sparse semantic features. The state-space +model, Mamba, addresses these issues through its adept handling of selective +input and parallel computing, striking a balance between computational +efficiency and prediction accuracy. This article examines the advantages and +disadvantages of both Mamba and Transformer models, and introduces a combined +approach, MAT, which leverages the strengths of each model to capture unique +long-short range dependencies and inherent evolutionary patterns in +multivariate time series. Specifically, MAT harnesses the long-range dependency +capabilities of Mamba and the short-range characteristics of Transformers. +Experimental results on benchmark weather datasets demonstrate that MAT +outperforms existing comparable methods in terms of prediction accuracy, +scalability, and memory efficiency. + +
+
+ comment: 6 pages, 4 figures, to be presented at the 5th International + Conference on Electrical, Communication and Computer Engineering (ICECCE) +
+
+
+
+
+ + ☆ MAPX: An explainable model-agnostic framework for the detection of false + information on social media networks + + +
+ The automated detection of false information has become a fundamental task in +combating the spread of "fake news" on online social media networks (OSMN) as +it reduces the need for manual discernment by individuals. In the literature, +leveraging various content or context features of OSMN documents has been +found useful. However, most existing detection models utilise +these features in isolation without regard to the temporal and dynamic changes +oft-seen in reality, thus limiting the robustness of the models. Furthermore, +there has been little to no consideration of the impact of the quality of +documents' features on the trustworthiness of the final prediction. In this +paper, we introduce a novel model-agnostic framework, called MAPX, which allows +evidence-based aggregation of predictions from existing models in an +explainable manner. Indeed, the developed aggregation method is adaptive, +dynamic and considers the quality of OSMN document features. Further, we +perform extensive experiments on benchmarked fake news datasets to demonstrate +the effectiveness of MAPX using various real-world data quality scenarios. Our +empirical results show that the proposed framework consistently outperforms all +state-of-the-art models evaluated. For reproducibility, a demo of MAPX is +available at https://github.com/SCondran/MAPX_framework. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Optimal Classification-based Anomaly Detection with Neural Networks: + Theory and Practice + + +
+ Anomaly detection is an important problem in many application areas, such as +network security. Many deep learning methods for unsupervised anomaly detection +produce good empirical performance but lack theoretical guarantees. By casting +anomaly detection into a binary classification problem, we establish +non-asymptotic upper bounds and a convergence rate on the excess risk on +rectified linear unit (ReLU) neural networks trained on synthetic anomalies. +Our convergence rate on the excess risk matches the minimax optimal rate in the +literature. Furthermore, we provide lower and upper bounds on the number of +synthetic anomalies that can attain this optimality. For practical +implementation, we relax some conditions to improve the search for the +empirical risk minimizer, which leads to competitive performance to other +classification-based methods for anomaly detection. Overall, our work provides +the first theoretical guarantees of unsupervised neural network-based anomaly +detectors and empirical insights on how to design them well. + +
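A minimal version of the classification-based recipe, synthetic anomalies drawn uniformly over the data range and a small ReLU network as the detector, looks like the toy setup below (hypothetical data, not the paper's experiments):

# Sketch of classification-based anomaly detection with synthetic anomalies.
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
X_normal = rng.normal(loc=0.0, scale=1.0, size=(1000, 5))          # nominal data
lo, hi = X_normal.min(axis=0) - 1, X_normal.max(axis=0) + 1
X_synth = rng.uniform(lo, hi, size=(1000, 5))                       # synthetic anomalies

X = np.vstack([X_normal, X_synth])
y = np.concatenate([np.zeros(1000), np.ones(1000)])                 # 1 = anomaly

clf = MLPClassifier(hidden_layer_sizes=(64, 64), activation="relu",
                    max_iter=500, random_state=0).fit(X, y)

X_test_inlier = rng.normal(size=(5, 5))
X_test_outlier = rng.normal(loc=4.0, size=(5, 5))                   # far from nominal mass
print("inlier anomaly scores :", clf.predict_proba(X_test_inlier)[:, 1].round(2))
print("outlier anomaly scores:", clf.predict_proba(X_test_outlier)[:, 1].round(2))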
+
+
+
+
+ + ☆ Anytime Continual Learning for Open Vocabulary Classification ECCV 2024 + + +
+ We propose an approach for anytime continual learning (AnytimeCL) for open +vocabulary image classification. The AnytimeCL problem aims to break away from +batch training and rigid models by requiring that a system can predict any set +of labels at any time and efficiently update and improve when receiving one or +more training samples at any time. Despite the challenging goal, we achieve +substantial improvements over recent methods. We propose a dynamic weighting +between predictions of a partially fine-tuned model and a fixed open vocabulary +model that enables continual improvement when training samples are available +for a subset of a task's labels. We also propose an attention-weighted PCA +compression of training features that reduces storage and computation with +little impact to model accuracy. Our methods are validated with experiments +that test flexibility of learning and inference. Code is available at +https://github.com/jessemelpolio/AnytimeCL. + +
+
+ comment: To appear at ECCV 2024 as Oral presentation +
+
+
+
+
+ + ☆ Enhancing Privacy in ControlNet and Stable Diffusion via Split Learning + + +
+ With the emerging trend of large generative models, ControlNet is introduced +to enable users to fine-tune pre-trained models with their own data for various +use cases. A natural question arises: how can we train ControlNet models while +ensuring users' data privacy across distributed devices? Exploring different +distributed training schemes, we find conventional federated learning and split +learning unsuitable. Instead, we propose a new distributed learning structure +that eliminates the need for the server to send gradients back. Through a +comprehensive evaluation of existing threats, we discover that in the context +of training ControlNet with split learning, most existing attacks are +ineffective, except for two mentioned in previous literature. To counter these +threats, we leverage the properties of diffusion models and design a new +timestep sampling policy during forward processes. We further propose a +privacy-preserving activation function and a method to prevent private text +prompts from leaving clients, tailored for image generation with diffusion +models. Our experimental results demonstrate that our algorithms and systems +greatly enhance the efficiency of distributed training for ControlNet while +ensuring users' data privacy without compromising image generation quality. + +
+
+
+
+
+ + ☆ Sub-graph Based Diffusion Model for Link Prediction + + +
+ Denoising Diffusion Probabilistic Models (DDPMs) represent a contemporary +class of generative models with exceptional qualities in both synthesis and +maximizing the data likelihood. These models work by traversing a forward +Markov Chain where data is perturbed, followed by a reverse process where a +neural network learns to undo the perturbations and recover the original data. +There have been increasing efforts exploring the applications of DDPMs in the +graph domain. However, most of them have focused on the generative perspective. +In this paper, we aim to build a novel generative model for link prediction. In +particular, we treat link prediction between a pair of nodes as a conditional +likelihood estimation of its enclosing sub-graph. With a dedicated design to +decompose the likelihood estimation process via the Bayesian formula, we are +able to separate the estimation of sub-graph structure and its node features. +Such designs allow our model to simultaneously enjoy the advantages of +inductive learning and the strong generalization capability. Remarkably, +comprehensive experiments across various datasets validate that our proposed +method presents numerous advantages: (1) transferability across datasets +without retraining, (2) promising generalization on limited training data, and +(3) robustness against graph adversarial attacks. + +
+
+ comment: 17 pages, 3 figures +
+
+
+
+
+ + ☆ Risks When Sharing LoRA Fine-Tuned Diffusion Model Weights + + +
+ With the emerging trend in generative models and convenient public access to +diffusion models pre-trained on large datasets, users can fine-tune these +models to generate images of personal faces or items in new contexts described +by natural language. Parameter efficient fine-tuning (PEFT) such as Low Rank +Adaptation (LoRA) has become the most common way to save memory and computation +usage on the user end during fine-tuning. However, a natural question is +whether the private images used for fine-tuning will be leaked to adversaries +when sharing model weights. In this paper, we study the issue of privacy +leakage of a fine-tuned diffusion model in a practical setting, where +adversaries only have access to model weights, rather than prompts or images +used for fine-tuning. We design and build a variational network autoencoder +that takes model weights as input and outputs the reconstruction of private +images. To improve the efficiency of training such an autoencoder, we propose a +training paradigm with the help of timestep embedding. The results give a +surprising answer to this research question: an adversary can generate images +containing the same identities as the private images. Furthermore, we +demonstrate that no existing defense method, including differential +privacy-based methods, can preserve the privacy of private data used for +fine-tuning a diffusion model without compromising the utility of a fine-tuned +model. + +
+
+
+
+
+ + ☆ Integrating Neural Operators with Diffusion Models Improves Spectral + Representation in Turbulence Modeling + + +
+ We integrate neural operators with diffusion models to address the spectral +limitations of neural operators in surrogate modeling of turbulent flows. While +neural operators offer computational efficiency, they exhibit deficiencies in +capturing high-frequency flow dynamics, resulting in overly smooth +approximations. To overcome this, we condition diffusion models on neural +operators to enhance the resolution of turbulent structures. Our approach is +validated for different neural operators on diverse datasets, including a high +Reynolds number jet flow simulation and experimental Schlieren velocimetry. The +proposed method significantly improves the alignment of predicted energy +spectra with true distributions compared to neural operators alone. +Additionally, proper orthogonal decomposition analysis demonstrates enhanced +spectral fidelity in space-time. This work establishes a new paradigm for +combining generative models with neural operators to advance surrogate modeling +of turbulent systems, and it can be used in other scientific applications that +involve microstructure and high-frequency content. See our project page: +vivekoommen.github.io/NO_DM + +
+
+
+
+
+ + ☆ Rethinking Meta-Learning from a Learning Lens + + +
+ Meta-learning has emerged as a powerful approach for leveraging knowledge +from previous tasks to solve new tasks. The mainstream methods focus on +training a well-generalized model initialization, which is then adapted to +different tasks with limited data and updates. However, this can push the model +toward overfitting on the training tasks. Previous methods mainly attributed this to +the lack of data and used augmentations to address this issue, but they were +limited by the need for sufficient training and effective augmentation strategies. In this +work, we focus on the more fundamental ``learning to learn'' strategy of +meta-learning to explore what causes errors and how to eliminate these errors +without changing the environment. Specifically, we first rethink the +algorithmic procedure of meta-learning from a ``learning'' lens. Through +theoretical and empirical analyses, we find that (i) this paradigm faces the +risk of both overfitting and underfitting and (ii) models adapted to +different tasks promote each other, with a stronger effect when the tasks +are more similar. Based on this insight, we propose using task relations to +calibrate the optimization process of meta-learning and propose a plug-and-play +method called Task Relation Learner (TRLearner) to achieve this goal. +Specifically, it first obtains task relation matrices from the extracted +task-specific meta-data. Then, it uses the obtained matrices with +relation-aware consistency regularization to guide optimization. Extensive +theoretical and empirical analyses demonstrate the effectiveness of TRLearner. + +
+
+
+
+
+ + ☆ Improved Finite-Particle Convergence Rates for Stein Variational + Gradient Descent + + +
+ We provide finite-particle convergence rates for the Stein Variational +Gradient Descent (SVGD) algorithm in the Kernel Stein Discrepancy +($\mathsf{KSD}$) and Wasserstein-2 metrics. Our key insight is the observation +that the time derivative of the relative entropy between the joint density of +$N$ particle locations and the $N$-fold product target measure, starting from a +regular initial distribution, splits into a dominant `negative part' +proportional to $N$ times the expected $\mathsf{KSD}^2$ and a smaller `positive +part'. This observation leads to $\mathsf{KSD}$ rates of order $1/\sqrt{N}$, +providing a near optimal double exponential improvement over the recent result +by~\cite{shi2024finite}. Under mild assumptions on the kernel and potential, +these bounds also grow linearly in the dimension $d$. By adding a bilinear +component to the kernel, the above approach is used to further obtain +Wasserstein-2 convergence. For the case of `bilinear + Mat\'ern' kernels, we +derive Wasserstein-2 rates that exhibit a curse-of-dimensionality similar to +the i.i.d. setting. We also obtain marginal convergence and long-time +propagation of chaos results for the time-averaged particle laws. + +
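For context, the particle update whose finite-N behaviour these rates describe is the standard SVGD iteration; the reference loop below (textbook algorithm with an RBF kernel and a 1-D Gaussian target) is included only to make that update concrete and is not taken from the paper:

# Standard SVGD reference loop (textbook algorithm, not the paper's analysis).
import numpy as np

def svgd_direction(x, h, score):
    """SVGD update direction for 1-D particles with an RBF kernel of bandwidth h."""
    diff = x[:, None] - x[None, :]                 # diff[i, j] = x_i - x_j
    K = np.exp(-diff ** 2 / (2 * h ** 2))
    attract = K @ score(x)                         # kernel-weighted score (drift) term
    repulse = (diff / h ** 2 * K).sum(axis=1)      # repulsion keeps particles spread out
    return (attract + repulse) / len(x)

score = lambda x: -(x - 2.0) / 0.5 ** 2            # target: N(2.0, 0.5^2)
rng = np.random.default_rng(0)
x = rng.normal(scale=3.0, size=100)                # N = 100 particles, broad start
for _ in range(500):
    h = np.median(np.abs(x[:, None] - x[None, :])) / np.sqrt(np.log(len(x))) + 1e-6
    x = x + 0.05 * svgd_direction(x, h, score)
print("particle mean/std:", round(float(x.mean()), 2), round(float(x.std()), 2))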
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Explaining Datasets in Words: Statistical Models with Natural Language + Parameters + + +
+ To make sense of massive data, we often fit simplified models and then +interpret the parameters; for example, we cluster the text embeddings and then +interpret the mean parameters of each cluster. However, these parameters are +often high-dimensional and hard to interpret. To make model parameters directly +interpretable, we introduce a family of statistical models -- including +clustering, time series, and classification models -- parameterized by natural +language predicates. For example, a cluster of text about COVID could be +parameterized by the predicate "discusses COVID". To learn these statistical +models effectively, we develop a model-agnostic algorithm that optimizes +continuous relaxations of predicate parameters with gradient descent and +discretizes them by prompting language models (LMs). Finally, we apply our +framework to a wide range of problems: taxonomizing user chat dialogues, +characterizing how they evolve across time, finding categories where one +language model is better than the other, clustering math problems based on +subareas, and explaining visual features in memorable images. Our framework is +highly versatile, applicable to both textual and visual domains, can be easily +steered to focus on specific properties (e.g. subareas), and explains +sophisticated concepts that classical methods (e.g. n-gram analysis) struggle +to produce. + +
+
+
+
+
+ + ☆ Input-to-State Stable Coupled Oscillator Networks for Closed-form + Model-based Control in Latent Space + + +
+ Even though a variety of methods (e.g., RL, MPC, LQR) have been proposed in +the literature, efficient and effective latent-space control of physical +systems remains an open challenge. A promising avenue would be to leverage +powerful and well-understood closed-form strategies from control theory +literature in combination with learned dynamics, such as potential-energy +shaping. We identify three fundamental shortcomings in existing latent-space +models that have so far prevented this powerful combination: (i) they lack the +mathematical structure of a physical system, (ii) they do not inherently +conserve the stability properties of the real systems. Furthermore, (iii) these +methods do not have an invertible mapping between input and latent-space +forcing. This work proposes a novel Coupled Oscillator Network (CON) model that +simultaneously tackles all these issues. More specifically, (i) we show +analytically that CON is a Lagrangian system - i.e., it possesses well-defined +potential and kinetic energy terms. Then, (ii) we provide formal proof of +global Input-to-State stability using Lyapunov arguments. Moving to the +experimental side, (iii) we demonstrate that CON reaches SoA performance when +learning complex nonlinear dynamics of mechanical systems directly from images. +An additional methodological innovation contributing to achieving this third +goal is an approximated closed-form solution for efficient integration of +network dynamics, which eases efficient training. We tackle (iv) by +approximating the forcing-to-input mapping with a decoder that is trained to +reconstruct the input based on the encoded latent space force. Finally, we +leverage these four properties and show that they enable latent-space control. +We use an integral-saturated PID with potential force compensation and +demonstrate high-quality performance on a soft robot using raw pixels as the +only feedback information. + +
+
+ comment: 41 pages, currently under review +
+
+
+
+
+ + ☆ Predictive Control and Regret Analysis of Non-Stationary MDP with + Look-ahead Information + + +
+ Policy design in non-stationary Markov Decision Processes (MDPs) is +inherently challenging due to the complexities introduced by time-varying +system transition and reward, which make it difficult for learners to determine +the optimal actions for maximizing cumulative future rewards. Fortunately, in +many practical applications, such as energy systems, look-ahead predictions are +available, including forecasts for renewable energy generation and demand. In +this paper, we leverage these look-ahead predictions and propose an algorithm +designed to achieve low regret in non-stationary MDPs by incorporating such +predictions. Our theoretical analysis demonstrates that, under certain +assumptions, the regret decreases exponentially as the look-ahead window +expands. When the system prediction is subject to error, the regret does not +explode even if the prediction error grows sub-exponentially as a function of +the prediction horizon. We validate our approach through simulations, +confirming the efficacy of our algorithm in non-stationary environments. + +
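+ One simple way to make the look-ahead idea concrete is receding-horizon
+backward induction over the forecast window: plan against the predicted
+transitions and rewards, execute the first action, then re-plan at the next
+step. This is only a toy sketch of that scheme, not the algorithm analyzed in
+the paper.
+
+    import numpy as np
+
+    def lookahead_action(P_forecast, R_forecast, s0, gamma=1.0):
+        # P_forecast: list of (S, A, S) predicted transition matrices for the
+        # next W steps; R_forecast: list of (S, A) predicted reward matrices.
+        W = len(P_forecast)
+        S, A = R_forecast[0].shape
+        V = np.zeros(S)                      # value beyond the look-ahead window
+        Q = None
+        for t in reversed(range(W)):         # backward induction over the window
+            Q = R_forecast[t] + gamma * (P_forecast[t] @ V)   # (S, A)
+            V = Q.max(axis=1)
+        return int(Q[s0].argmax())           # act now, re-plan at the next step
+
+    # toy usage: 2 states, 2 actions, a window of 3 forecasted steps
+    rng = np.random.default_rng(0)
+    P = [rng.dirichlet(np.ones(2), size=(2, 2)) for _ in range(3)]
+    R = [rng.random((2, 2)) for _ in range(3)]
+    print(lookahead_action(P, R, s0=0))
+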
+
+
+
+
+ + ♻ ☆ Policy Optimization finds Nash Equilibrium in Regularized General-Sum LQ + Games + + +
+ In this paper, we investigate the impact of introducing relative entropy +regularization on the Nash Equilibria (NE) of General-Sum $N$-agent games, +revealing the fact that the NE of such games conform to linear Gaussian +policies. Moreover, it delineates sufficient conditions, contingent upon the +adequacy of entropy regularization, for the uniqueness of the NE within the +game. As Policy Optimization serves as a foundational approach for +Reinforcement Learning (RL) techniques aimed at finding the NE, in this work we +prove the linear convergence of a policy optimization algorithm which (subject +to the adequacy of entropy regularization) is capable of provably attaining the +NE. Furthermore, in scenarios where the entropy regularization proves +insufficient, we present a $\delta$-augmentation technique, which facilitates +the achievement of an $\epsilon$-NE within the game. + +
+
+ comment: Accepted for Conference on Decision and Control 2024 +
+
+
+
+
+ + ♻ ☆ Fairness-enhancing mixed effects deep learning improves fairness on in- + and out-of-distribution clustered (non-iid) data + + +
+ Traditional deep learning (DL) models face two key challenges. First, they +assume training samples are independent and identically distributed, an +assumption often violated in real-world datasets where samples are grouped by +shared measurements (e.g., participants or cells). This leads to performance +degradation, limited generalization, and confounding issues, causing Type 1 and +Type 2 errors. Second, DL models typically prioritize overall accuracy, often +overlooking fairness across underrepresented groups, leading to biased outcomes +in critical areas such as loan approvals and healthcare decisions. To address +these issues, we introduce the Fair Mixed Effects Deep Learning (Fair MEDL) +framework. Fair MEDL quantifies cluster-invariant fixed effects (FE) and +cluster-specific random effects (RE) through 1) a cluster adversary for +learning invariant FE, 2) a Bayesian neural network for RE, and 3) a mixing +function combining FE and RE for final predictions. Additionally, we +incorporate adversarial debiasing to promote fairness across three key metrics: +Equalized Odds, Demographic Parity, and Counterfactual Fairness. Our method +also identifies and de-weights confounding probes, improving interpretability. +Evaluated on three datasets from finance and healthcare, Fair MEDL improves +fairness by up to 73% for age, 47% for race, 83% for sex, and 26% for marital +status, while maintaining robust predictive performance. Our implementation is +publicly available on GitHub. + +
+
+
+
+
+ + ♻ ☆ IoTCO2: Assessing the End-To-End Carbon Footprint of + Internet-of-Things-Enabled Deep Learning + + +
+ To improve privacy and ensure quality-of-service (QoS), deep learning (DL)
+models are increasingly deployed on Internet of Things (IoT) devices for data
+processing, significantly increasing the carbon footprint associated with DL on
+IoT, covering both operational and embodied aspects. Existing operational
+energy predictors often overlook quantized DL models and emerging neural
+processing units (NPUs), while embodied carbon footprint modeling tools neglect
+non-computing hardware components common in IoT devices, creating a gap in
+accurate carbon footprint modeling tools for IoT-enabled DL. This paper
+introduces IoTCO2, an end-to-end tool for precise carbon footprint estimation
+in IoT-enabled DL, with deviations as low as 5% for operational and 3.23% for
+embodied carbon footprints compared to actual measurements across various DL
+models. Additionally, practical applications of IoTCO2 are showcased through
+multiple use case studies.
+
+
+ comment: 5 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ CoverUp: Coverage-Guided LLM-Based Test Generation + + +
+ Testing is an essential part of software development. Test generation tools +attempt to automate the otherwise labor-intensive task of test creation, but +generating high-coverage tests remains a challenge. This paper proposes +CoverUp, a novel approach to driving the generation of high-coverage Python +regression tests. CoverUp iteratively improves test coverage, interleaving +coverage analysis with dialogs with the LLM that steer it to refine tests so +that they increase coverage of lines and branches. We evaluate our prototype +CoverUp implementation across a benchmark of challenging code derived from +open-source Python projects, and show that CoverUp substantially improves on +the state of the art. Compared to CodaMosa, a hybrid search/LLM-based test +generator, CoverUp achieves a per-module median line+branch coverage of 80% +(vs. 47%). Compared to MuTAP, a mutation/LLM-based test generator, CoverUp +achieves an overall line+branch coverage of 90% (vs. 77%). We show that +CoverUp's iterative, coverage-guided approach is crucial to its effectiveness, +contributing to nearly 40% of its successes. + +
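+ The coverage-guided loop can be sketched in a few lines. This is not
+CoverUp's actual implementation: ask_llm is a placeholder for a chat-model
+call, and the coverage.py invocation is just one plausible way to obtain the
+uncovered lines the model is asked about.
+
+    import json
+    import subprocess
+
+    def uncovered_lines(source, test_file):
+        # Run the tests under coverage.py and return the missing line numbers.
+        subprocess.run(["coverage", "run", "-m", "pytest", test_file], check=False)
+        subprocess.run(["coverage", "json", "-o", "cov.json"], check=False)
+        with open("cov.json") as f:
+            report = json.load(f)
+        return report["files"][source]["missing_lines"]
+
+    def refine_tests(source, test_file, ask_llm, max_rounds=5):
+        # Interleave coverage analysis with LLM dialogs, appending new tests
+        # until nothing is missing or the round budget is exhausted.
+        for _ in range(max_rounds):
+            missing = uncovered_lines(source, test_file)
+            if not missing:
+                break
+            with open(source) as f:
+                code = f.read()
+            prompt = (f"These lines of {source} are not covered: {missing}.\n"
+                      f"{code}\nWrite additional pytest tests that cover them.")
+            with open(test_file, "a") as f:
+                f.write("\n" + ask_llm(prompt))
+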
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Does a Neural Network Really Encode Symbolic Concepts? + + +
+ Recently, a series of studies have tried to extract interactions between
+input variables modeled by a DNN and define such interactions as concepts
+encoded by the DNN. However, strictly speaking, there is still no solid
+guarantee that such interactions indeed represent meaningful concepts.
+Therefore, in this paper, we examine the trustworthiness of interaction
+concepts from four perspectives. Extensive empirical studies have verified that
+a well-trained DNN usually encodes sparse, transferable, and discriminative
+concepts, which is partially aligned with human intuition.
+
+
+
+
+
+ + ♻ ☆ MLP, XGBoost, KAN, TDNN, and LSTM-GRU Hybrid RNN with Attention for SPX + and NDX European Call Option Pricing + + +
+ We explore the performance of various artificial neural network
+architectures, including a multilayer perceptron (MLP), Kolmogorov-Arnold
+network (KAN), LSTM-GRU hybrid recurrent neural network (RNN) models, and a
+time-delay neural network (TDNN) for pricing European call options. In this
+study, we attempt to leverage the ability of supervised learning methods, such
+as ANNs, KANs, and gradient-boosted decision trees, to approximate complex
+multivariate functions in order to calibrate option prices based on past market
+data. The motivation for using ANNs and KANs is the Universal Approximation
+Theorem and the Kolmogorov-Arnold Representation Theorem, respectively.
+Specifically, we use S\&P 500 (SPX) and NASDAQ 100 (NDX) index options traded
+during 2015-2023 with times to maturity ranging from 15 days to over 4 years
+(OptionMetrics IvyDB US dataset). The performance of the Black \& Scholes (BS)
+PDE model in pricing the same options, compared to real data, is used as a
+benchmark. This model relies on strong assumptions, and it has been observed
+and discussed in the literature that real data does not match its predictions.
+Supervised learning methods are widely used as an alternative for calibrating
+option prices due to some of the limitations of this model. In our experiments,
+the BS model underperforms compared to all of the others. Also, the best TDNN
+model outperforms the best MLP model on all error metrics. We implement a
+simple self-attention mechanism to enhance the RNN models, significantly
+improving their performance. The best-performing model overall is the LSTM-GRU
+hybrid RNN model with attention. Also, the KAN model outperforms the TDNN and
+MLP models. We analyze the performance of all models by ticker, moneyness
+category, and over/under/correctly-priced percentage.
+
+
+ comment: 78 pages, 39 figures +
+
+
+
+
+ + ♻ ☆ Understanding How CodeLLMs (Mis)Predict Types with Activation Steering + + +
+ CodeLLMs are transforming software development as we know it. This is
+especially true for tasks where rule-based approaches fall short, like type
+prediction. The type prediction task consists of adding a new type annotation
+to a partially typed program, such that the resulting program is closer to
+being fully typed. The intractability of rule-based approaches and the high
+cost of manual annotation make CodeLLMs an attractive solution to the problem.
+However, CodeLLMs are still far from being deployed at large scale due to
+doubts surrounding their reliability.
+ To shed some light on how CodeLLMs approach type prediction, we investigate
+what happens when a model mispredicts a type. We show that by applying
+semantics-preserving edits to code, CodeLLMs are eventually misled into
+mispredicting type annotations. However, by leveraging activation steering we
+are able to "steer" the model back to the correct prediction, making models
+more robust against semantically irrelevant prompt features. We show that
+steering achieves comparable performance to fine-tuning directly on the type
+prediction task. Furthermore, we find that steering vectors computed from
+Python code are effective at correcting TypeScript mispredictions, and vice
+versa. To our knowledge, this is the first evidence of its kind to suggest that
+CodeLLMs learn task representations that transfer across languages.
+
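+ A common recipe for this kind of intervention is to take the mean activation
+difference between prompts that elicit correct and incorrect predictions and
+add a scaled copy of it back into the residual stream at inference time. The
+sketch below assumes a HuggingFace-style decoder whose blocks return a tuple
+with the hidden states first; the paper's exact construction of steering
+vectors may differ.
+
+    import torch
+
+    @torch.no_grad()
+    def steering_vector(model, layer, correct_prompts, wrong_prompts, tokenize):
+        # Mean last-token activation difference at one transformer block.
+        def mean_act(prompts):
+            acts = []
+            def hook(_module, _inputs, output):
+                acts.append(output[0][:, -1].detach())   # output = (hidden, ...)
+            handle = layer.register_forward_hook(hook)
+            for p in prompts:
+                model(**tokenize(p))
+            handle.remove()
+            return torch.cat(acts).mean(dim=0)
+        return mean_act(correct_prompts) - mean_act(wrong_prompts)
+
+    def apply_steering(layer, v, alpha=4.0):
+        # Add the steering vector to the block's output on every forward pass;
+        # keep the returned handle to remove the steering later.
+        def hook(_module, _inputs, output):
+            return (output[0] + alpha * v,) + tuple(output[1:])
+        return layer.register_forward_hook(hook)
+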
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Linear Attention is Enough in Spatial-Temporal Forecasting + + +
+ As the most representative scenario among spatial-temporal forecasting
+tasks, traffic forecasting has attracted considerable attention from the
+machine learning community due to its intricate correlations in both the space
+and time dimensions. Existing methods often treat road networks over time as
+spatial-temporal graphs, addressing spatial and temporal representations
+independently. However, these approaches struggle to capture the dynamic
+topology of road networks, encounter issues with message passing mechanisms and
+over-smoothing, and face challenges in learning spatial and temporal
+relationships separately. To address these limitations, we propose treating
+nodes in road networks at different time steps as independent spatial-temporal
+tokens and feeding them into a vanilla Transformer to learn complex
+spatial-temporal patterns, and design \textbf{STformer}, which achieves
+state-of-the-art performance. Given its quadratic complexity, we introduce a
+variant \textbf{NSTformer} based on the Nystr\"om method that approximates
+self-attention with linear complexity and, surprisingly, even performs slightly
+better than the former in a few cases. Extensive experimental results on
+traffic datasets demonstrate that the proposed method achieves state-of-the-art
+performance at an affordable computational cost. Our code is available at
+\href{https://github.com/XinyuNing/STformer-and-NSTformer}{https://github.com/XinyuNing/STformer-and-NSTformer}.
+
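+ The token construction is the core of the idea and is easy to sketch. The
+toy module below is a stand-in rather than the released STformer code, and the
+layer sizes are illustrative: every (time step, node) pair becomes one token
+and a vanilla Transformer encoder attends over all tokens jointly.
+
+    import torch
+    import torch.nn as nn
+
+    class STTokens(nn.Module):
+        def __init__(self, n_nodes, n_steps, in_dim, d_model=64, horizon=1):
+            super().__init__()
+            self.proj = nn.Linear(in_dim, d_model)
+            self.pos = nn.Parameter(torch.zeros(n_nodes * n_steps, d_model))
+            layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
+            self.encoder = nn.TransformerEncoder(layer, num_layers=2)
+            self.head = nn.Linear(d_model, horizon)
+
+        def forward(self, x):              # x: (batch, n_steps, n_nodes, in_dim)
+            b, t, n, f = x.shape
+            tokens = self.proj(x.reshape(b, t * n, f)) + self.pos
+            out = self.encoder(tokens)     # joint spatial-temporal attention
+            return self.head(out).reshape(b, t, n, -1)[:, -1]   # last-step nodes
+
+    x = torch.randn(8, 12, 20, 2)          # 12 steps, 20 sensors, 2 features
+    print(STTokens(20, 12, 2)(x).shape)    # torch.Size([8, 20, 1])
+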
+
+
+
+
+ + ♻ ☆ The Role of Explainable AI in Revolutionizing Human Health Monitoring + + +
+ The complex nature of disease mechanisms and the variability of patient
+symptoms present significant obstacles in developing effective diagnostic
+tools. Although machine learning has made considerable advances in medical
+diagnosis, its decision-making processes frequently lack transparency, which
+can jeopardize patient outcomes. This underscores the critical need for
+Explainable AI (XAI), which not only offers greater clarity but also has the
+potential to significantly improve patient care. In this literature review, we
+conduct a detailed analysis of XAI methods identified through searches across
+various databases, focusing on chronic conditions such as Parkinson's, stroke,
+depression, cancer, heart disease, and Alzheimer's disease. The literature
+search revealed the application of 9 trending XAI algorithms in the field of
+healthcare and highlighted the pros and cons of each of them. The article
+concludes with a critical appraisal of the challenges and future research
+opportunities for XAI in human health monitoring.
+
+
+
+
+
+ + ♻ ☆ Four Facets of Forecast Felicity: Calibration, Predictiveness, + Randomness and Regret + + +
+ Machine learning is about forecasting. Forecasts, however, obtain their
+usefulness only through their evaluation. Machine learning has traditionally
+focused on types of losses and their corresponding regret. Recently, the
+machine learning community has regained interest in calibration. In this work,
+we show the conceptual equivalence of calibration and regret in evaluating
+forecasts. We frame the evaluation problem as a game between a forecaster, a
+gambler and nature. Putting intuitive restrictions on the gambler and the
+forecaster, calibration and regret naturally fall out of the framework. In
+addition, this game links the evaluation of forecasts to the randomness of
+outcomes. Random outcomes with respect to forecasts are equivalent to good
+forecasts with respect to outcomes. We call these dual aspects, calibration and
+regret, predictiveness and randomness, the four facets of forecast felicity.
+
+
+
+
+
+ + ♻ ☆ Event Detection in Time Series: Universal Deep Learning Approach + + +
+ Event detection in time series is a challenging task due to the prevalence of +imbalanced datasets, rare events, and time interval-defined events. Traditional +supervised deep learning methods primarily employ binary classification, where +each time step is assigned a binary label indicating the presence or absence of +an event. However, these methods struggle to handle these specific scenarios +effectively. To address these limitations, we propose a novel supervised +regression-based deep learning approach that offers several advantages over +classification-based methods. Our approach, with a limited number of +parameters, can effectively handle various types of events within a unified +framework, including rare events and imbalanced datasets. We provide +theoretical justifications for its universality and precision and demonstrate +its superior performance across diverse domains, particularly for rare events +and imbalanced datasets. + +
+
+
+
+
+ + ♻ ☆ Revisiting Convergence of AdaGrad with Relaxed Assumptions UAI 2024 + + +
+ In this study, we revisit the convergence of AdaGrad with momentum (covering
+AdaGrad as a special case) on non-convex smooth optimization problems. We
+consider a general noise model where the noise magnitude is controlled by the
+function value gap together with the gradient magnitude. This model encompasses
+a broad range of noises including bounded noise, sub-Gaussian noise, affine
+variance noise and the expected smoothness, and it has been shown to be more
+realistic in many practical applications. Our analysis yields a probabilistic
+convergence rate which, under the general noise, can reach
+$\tilde{\mathcal{O}}(1/\sqrt{T})$. This rate does not rely on prior knowledge
+of problem parameters and can accelerate to $\tilde{\mathcal{O}}(1/T)$, where
+$T$ denotes the total number of iterations, when the noise parameters related
+to the function value gap and noise level are sufficiently small. The
+convergence rate thus matches the lower bound for stochastic first-order
+methods over non-convex smooth landscapes up to logarithmic terms [Arjevani et
+al., 2023]. We further derive a convergence bound for AdaGrad with momentum,
+considering the generalized smoothness where the local smoothness is controlled
+by a first-order function of the gradient norm.
+
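+ For reference, one standard form of AdaGrad with (heavy-ball) momentum
+studied in this line of work is the coordinate-wise update
+$m_t = \beta m_{t-1} + (1-\beta) g_t$, $v_t = v_{t-1} + g_t \odot g_t$,
+$x_{t+1} = x_t - \eta \, m_t / (\sqrt{v_t} + \epsilon)$, where $g_t$ is the
+stochastic gradient, all operations are coordinate-wise, and $\beta = 0$
+recovers plain AdaGrad; the exact coefficients analyzed in the paper may
+differ.
+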
+
+ comment: Accepted by UAI 2024 +
+
+
+
+
+ + ♻ ☆ Performance Law of Large Language Models + + +
+ Guided by the belief of the scaling law, large language models (LLMs) have +achieved impressive performance in recent years. However, scaling law only +gives a qualitative estimation of loss, which is influenced by various factors +such as model architectures, data distributions, tokenizers, and computation +precision. Thus, estimating the real performance of LLMs with different +training settings rather than loss may be quite useful in practical +development. In this article, we present an empirical equation named +"Performance Law" to directly predict the MMLU score of an LLM, which is a +widely used metric to indicate the general capability of LLMs in real-world +conversations and applications. Based on only a few key hyperparameters of the +LLM architecture and the size of training data, we obtain a quite accurate MMLU +prediction of various LLMs with diverse sizes and architectures developed by +different organizations in different years. Performance law can be used to +guide the choice of LLM architecture and the effective allocation of +computational resources without extensive experiments. + +
+
+ comment: Personal opinions of the authors +
+
+
+
+
+ + ♻ ☆ Defining and Extracting generalizable interaction primitives from DNNs + + +
+ Faithfully summarizing the knowledge encoded by a deep neural network (DNN) +into a few symbolic primitive patterns without losing much information +represents a core challenge in explainable AI. To this end, Ren et al. (2024) +have derived a series of theorems to prove that the inference score of a DNN +can be explained as a small set of interactions between input variables. +However, the lack of generalization power makes it still hard to consider such +interactions as faithful primitive patterns encoded by the DNN. Therefore, +given different DNNs trained for the same task, we develop a new method to +extract interactions that are shared by these DNNs. Experiments show that the +extracted interactions can better reflect common knowledge shared by different +DNNs. + +
+
+
+
+
+ + ♻ ☆ XAMI -- A Benchmark Dataset for Artefact Detection in XMM-Newton Optical + Images SP + + +
+ Reflected or scattered light produces artefacts in astronomical observations
+that can negatively impact scientific studies. Hence, automated detection of
+these artefacts is highly beneficial, especially with the increasing amounts of
+data gathered. Machine learning methods are well-suited to this problem, but
+currently there is a lack of annotated data to train such approaches to detect
+artefacts in astronomical observations. In this work, we present a dataset of
+images from the XMM-Newton space telescope Optical Monitoring camera showing
+different types of artefacts. We hand-annotated a sample of 1000 images with
+artefacts, which we use to train automated ML methods. We further demonstrate
+techniques tailored for accurate detection and masking of artefacts using
+instance segmentation. We adopt a hybrid approach, combining knowledge from
+both convolutional neural networks (CNNs) and transformer-based models and
+using their advantages in segmentation. The presented method and dataset will
+advance artefact detection in astronomical observations by providing a
+reproducible baseline. All code and data are made available
+(https://github.com/ESA-Datalabs/XAMI-model and
+https://github.com/ESA-Datalabs/XAMI-dataset).
+
+
+ comment: Accepted for oral presentation at SPAICE 2024 +
+
+
+
+
+ + ♻ ☆ A Passivity-Based Method for Accelerated Convex Optimisation + + +
+ This study presents a constructive methodology for designing accelerated +convex optimisation algorithms in continuous-time domain. The two key enablers +are the classical concept of passivity in control theory and the time-dependent +change of variables that maps the output of the internal dynamic system to the +optimisation variables. The Lyapunov function associated with the optimisation +dynamics is obtained as a natural consequence of specifying the internal +dynamics that drives the state evolution as a passive linear time-invariant +system. The passivity-based methodology provides a general framework that has +the flexibility to generate convex optimisation algorithms with the guarantee +of different convergence rate bounds on the objective function value. The same +principle applies to the design of online parameter update algorithms for +adaptive control by re-defining the output of internal dynamics to allow for +the feedback interconnection with tracking error dynamics. + +
+
+ comment: 10 pages, 1 figure, accepted for presentation at 2024 IEEE CDC +
+
+
+
+
+ + ♻ ☆ Exploring the Links between the Fundamental Lemma and Kernel Regression + + +
+ Generalizations and variations of the fundamental lemma by Willems et al. are +an active topic of recent research. In this note, we explore and formalize the +links between kernel regression and some known nonlinear extensions of the +fundamental lemma. Applying a transformation to the usual linear equation in +Hankel matrices, we arrive at an alternative implicit kernel representation of +the system trajectories while keeping the requirements on persistency of +excitation. We show that this representation is equivalent to the solution of a +specific kernel regression problem. We explore the possible structures of the +underlying kernel as well as the system classes to which they correspond. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Diverse Neural Audio Embeddings -- Bringing Features back ! ICASSP 2025 + + +
+ With the advent of modern AI architectures, a shift has happened towards
+end-to-end architectures. This pivot has led to neural architectures being
+trained without domain-specific biases/knowledge, optimized according to the
+task. In this paper, we learn audio embeddings via diverse, domain-specific
+feature representations. For the case of audio classification over hundreds of
+categories of sound, we learn robust separate embeddings for diverse audio
+properties such as pitch, timbre, and neural representation, in addition to
+learning an end-to-end representation. We observe that handcrafted embeddings,
+e.g., pitch- and timbre-based ones, cannot beat a fully end-to-end
+representation on their own, yet adding them to the end-to-end embedding
+significantly improves performance. This work paves the way to combining domain
+expertise with end-to-end models to learn robust, diverse representations,
+surpassing the performance of training end-to-end models alone.
+
+
+ comment: 6 pages, 1 figure, 2 table, Under Review for 50th IEEE ICASSP 2025, + Hyderabad, India +
+
+
+
+
+ + ♻ ☆ RePlay: a Recommendation Framework for Experimentation and Production + Use + + +
+ Using a single tool to build and compare recommender systems significantly
+reduces the time to market for new models. In addition, the comparison results
+when using such tools look more consistent. This is why many different tools
+and libraries for researchers in the field of recommendations have recently
+appeared. Unfortunately, most of these frameworks are aimed primarily at
+researchers and require modification for use in production due to the inability
+to work on large datasets or an inappropriate architecture. In this demo, we
+present our open-source toolkit RePlay - a framework containing an end-to-end
+pipeline for building recommender systems, which is ready for production use.
+RePlay also allows you to use a suitable stack at each stage of the pipeline:
+Pandas, Polars, or Spark. This allows the library to scale computations and
+deploy to a cluster. Thus, RePlay allows data scientists to easily move from
+research mode to production mode using the same interfaces.
+
+
+
+
+
+ + ♻ ☆ Quantum-secure multiparty deep learning + + +
+ Secure multiparty computation enables the joint evaluation of multivariate +functions across distributed users while ensuring the privacy of their local +inputs. This field has become increasingly urgent due to the exploding demand +for computationally intensive deep learning inference. These computations are +typically offloaded to cloud computing servers, leading to vulnerabilities that +can compromise the security of the clients' data. To solve this problem, we +introduce a linear algebra engine that leverages the quantum nature of light +for information-theoretically secure multiparty computation using only +conventional telecommunication components. We apply this linear algebra engine +to deep learning and derive rigorous upper bounds on the information leakage of +both the deep neural network weights and the client's data via the Holevo and +the Cram\'er-Rao bounds, respectively. Applied to the MNIST classification +task, we obtain test accuracies exceeding $96\%$ while leaking less than $0.1$ +bits per weight symbol and $0.01$ bits per data symbol. This weight leakage is +an order of magnitude below the minimum bit precision required for accurate +deep learning using state-of-the-art quantization techniques. Our work lays the +foundation for practical quantum-secure computation and unlocks secure cloud +deep learning as a field. + +
+
+
+
+
+ + ♻ ☆ Exploring the Effectiveness of Object-Centric Representations in Visual + Question Answering: Comparative Insights with Foundation Models + + +
+ Object-centric (OC) representations, which represent the state of a visual +scene by modeling it as a composition of objects, have the potential to be used +in various downstream tasks to achieve systematic compositional generalization +and facilitate reasoning. However, these claims have not been thoroughly +analyzed yet. Recently, foundation models have demonstrated unparalleled +capabilities across diverse domains from language to computer vision, marking +them as a potential cornerstone of future research for a multitude of +computational tasks. In this paper, we conduct an extensive empirical study on +representation learning for downstream Visual Question Answering (VQA), which +requires an accurate compositional understanding of the scene. We thoroughly +investigate the benefits and trade-offs of OC models and alternative approaches +including large pre-trained foundation models on both synthetic and real-world +data, and demonstrate a viable way to achieve the best of both worlds. The +extensiveness of our study, encompassing over 800 downstream VQA models and 15 +different types of upstream representations, also provides several additional +insights that we believe will be of interest to the community at large. + +
+
+
+
+
+ + ♻ ☆ A Methodology to Study the Impact of Spiking Neural Network Parameters + considering Event-Based Automotive Data + + +
+ Autonomous Driving (AD) systems are considered the future of human mobility
+and transportation. Solving computer vision tasks such as image classification
+and object detection/segmentation, with high accuracy and low power/energy
+consumption, is highly needed to realize AD systems in real life. These
+requirements can potentially be satisfied by Spiking Neural Networks (SNNs).
+However, the state-of-the-art works in SNN-based AD systems still focus on
+proposing network models that can achieve high accuracy, and they have not
+systematically studied the roles of SNN parameters when used for learning
+event-based automotive data. Therefore, we still lack understanding of how to
+effectively develop SNN models for AD systems. Toward this, we propose a novel
+methodology to systematically study and analyze the impact of SNN parameters
+considering event-based automotive data, then leverage this analysis for
+enhancing SNN developments. To do this, we first explore different settings of
+SNN parameters that directly affect the learning mechanism (i.e., batch size,
+learning rate, neuron threshold potential, and weight decay), then analyze the
+accuracy results. Afterward, we propose techniques that jointly improve SNN
+accuracy and reduce training time. Experimental results show that our
+methodology improves SNN models for AD systems over the state-of-the-art, as it
+achieves higher accuracy (i.e., 86%) for the NCARS dataset, and it can also
+achieve iso-accuracy (i.e., ~85% with standard deviation less than 0.5%) while
+speeding up the training time by 1.9x. In this manner, our research work
+provides a set of guidelines for SNN parameter enhancements, thereby enabling
+the practical development of SNN-based AD systems.
+
+
+ comment: To appear at the 18th International Conference on Control, + Automation, Robotics and Vision (ICARCV), December 2024, Dubai, UAE +
+
+
+
+
+ + ♻ ☆ SpanSeq: Similarity-based sequence data splitting method for improved + development and assessment of deep learning projects + + +
+ The use of deep learning models in computational biology has increased
+massively in recent years, and it is expected to continue with current advances
+in fields such as Natural Language Processing. These models, although able to
+draw complex relations between input and target, are also inclined to learn
+noisy deviations from the pool of data used during their development. In order
+to assess their performance on unseen data (their capacity to generalize), it
+is common to split the available data randomly into development
+(train/validation) and test sets. This procedure, although standard, has been
+shown to produce dubious assessments of generalization due to the existing
+similarity between samples in the databases used. In this work, we present
+SpanSeq, a database partition method for machine learning that can scale to
+most biological sequences (genes, proteins and genomes) in order to avoid data
+leakage between sets. We also explore the effect of not restraining similarity
+between sets by reproducing the development of two state-of-the-art models in
+bioinformatics, not only confirming the consequences of randomly splitting
+databases for model assessment, but also extending those repercussions to model
+development. SpanSeq is available at
+https://github.com/genomicepidemiology/SpanSeq.
+
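+ The core idea of similarity-aware splitting can be illustrated with a small
+stand-in. SpanSeq itself relies on dedicated alignment and clustering tools,
+so the k-mer Jaccard threshold below is purely illustrative: sequences are
+first grouped into similarity clusters, and whole clusters, never individual
+sequences, are assigned to a partition.
+
+    from itertools import combinations
+
+    def kmers(seq, k=5):
+        return {seq[i:i + k] for i in range(len(seq) - k + 1)}
+
+    def similarity_split(seqs, threshold=0.5, test_frac=0.2, k=5):
+        parent = list(range(len(seqs)))
+        def find(i):                                   # union-find with path halving
+            while parent[i] != i:
+                parent[i] = parent[parent[i]]
+                i = parent[i]
+            return i
+        sets = [kmers(s, k) for s in seqs]
+        for i, j in combinations(range(len(seqs)), 2):
+            inter = len(sets[i] & sets[j])
+            union = len(sets[i] | sets[j]) or 1
+            if inter / union >= threshold:             # similar -> same cluster
+                parent[find(i)] = find(j)
+        groups = {}
+        for i in range(len(seqs)):
+            groups.setdefault(find(i), []).append(i)
+        train, test = [], []
+        for g in groups.values():                      # whole cluster goes to one side
+            (test if len(test) < test_frac * len(seqs) else train).extend(g)
+        return train, test
+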
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Edge Federated Learning for Intelligent Mobile-Health + Systems + + +
+ Machine Learning (ML) algorithms are generally designed for scenarios in +which all data is stored in one data center, where the training is performed. +However, in many applications, e.g., in the healthcare domain, the training +data is distributed among several entities, e.g., different hospitals or +patients' mobile devices/sensors. At the same time, transferring the data to a +central location for learning is certainly not an option, due to privacy +concerns and legal issues, and in certain cases, because of the communication +and computation overheads. Federated Learning (FL) is the state-of-the-art +collaborative ML approach for training an ML model across multiple parties +holding local data samples, without sharing them. However, enabling learning +from distributed data over such edge Internet of Things (IoT) systems (e.g., +mobile-health and wearable technologies, involving sensitive personal/medical +data) in a privacy-preserving fashion presents a major challenge mainly due to +their stringent resource constraints, i.e., limited computing capacity, +communication bandwidth, memory storage, and battery lifetime. In this paper, +we propose a privacy-preserving edge FL framework for resource-constrained +mobile-health and wearable technologies over the IoT infrastructure. We +evaluate our proposed framework extensively and provide the implementation of +our technique on Amazon's AWS cloud platform based on the seizure detection +application in epilepsy monitoring using wearable technologies. + +
+
+
+
+
+ + ♻ ☆ FaçAID: A Transformer Model for Neuro-Symbolic Facade Reconstruction SIGGRAPH + + +
+ We introduce a neuro-symbolic transformer-based model that converts flat,
+segmented facade structures into procedural definitions using a custom-designed
+split grammar. To facilitate this, we first develop a semi-complex split
+grammar tailored for architectural facades and then generate a dataset
+comprising facades alongside their corresponding procedural representations.
+This dataset is used to train our transformer model to convert segmented, flat
+facades into the procedural language of our grammar. During inference, the
+model applies this learned transformation to new facade segmentations,
+providing a procedural representation that users can adjust to generate varied
+facade designs. This method not only automates the conversion of static facade
+images into dynamic, editable procedural formats but also enhances design
+flexibility, allowing for easy modifications.
+
+
+ comment: 11 pages, 11 figures, in ACM SIGGRAPH Asia 2024 Conference Papers + Proceedings +
+
+
+
+
+ + ♻ ☆ RoboGPT: an intelligent agent of making embodied long-term decisions for + daily instruction tasks + + +
+ Robotic agents must master common sense and long-term sequential decisions to +solve daily tasks through natural language instruction. The developments in +Large Language Models (LLMs) in natural language processing have inspired +efforts to use LLMs in complex robot planning. Despite LLMs' great +generalization and comprehension of instruction tasks, LLMs-generated task +plans sometimes lack feasibility and correctness. To address the problem, we +propose a RoboGPT agent\footnote{our code and dataset will be released soon} +for making embodied long-term decisions for daily tasks, with two modules: 1) +LLMs-based planning with re-plan to break the task into multiple sub-goals; 2) +RoboSkill individually designed for sub-goals to learn better navigation and +manipulation skills. The LLMs-based planning is enhanced with a new robotic +dataset and re-plan, called RoboGPT. The new robotic dataset of 67k daily +instruction tasks is gathered for fine-tuning the Llama model and obtaining +RoboGPT. RoboGPT planner with strong generalization can plan hundreds of daily +instruction tasks. Additionally, a low-computational Re-Plan module is designed +to allow plans to flexibly adapt to the environment, thereby addressing the +nomenclature diversity challenge. The proposed RoboGPT agent outperforms SOTA +methods on the ALFRED daily tasks. Moreover, RoboGPT planner exceeds SOTA +LLM-based planners like ChatGPT in task-planning rationality for hundreds of +unseen daily tasks, and even other domain tasks, while keeping the large +model's original broad application and generality. + +
+
+
+
+
+ + ♻ ☆ Dendrites endow artificial neural networks with accurate, robust and + parameter-efficient learning + + +
+ Artificial neural networks (ANNs) are at the core of most deep learning (DL)
+algorithms that successfully tackle complex problems like image recognition,
+autonomous driving, and natural language processing. However, unlike biological
+brains, which tackle similar problems in a very efficient manner, DL algorithms
+require a large number of trainable parameters, making them energy-intensive
+and prone to overfitting. Here, we show that a new ANN architecture that
+incorporates the structured connectivity and restricted sampling properties of
+biological dendrites counteracts these limitations. We find that dendritic ANNs
+are more robust to overfitting and outperform traditional ANNs on several image
+classification tasks while using significantly fewer trainable parameters.
+These advantages are likely the result of a different learning strategy,
+whereby most of the nodes in dendritic ANNs respond to multiple classes, unlike
+classical ANNs that strive for class-specificity. Our findings suggest that the
+incorporation of dendritic properties can make learning in ANNs more precise,
+resilient, and parameter-efficient and shed new light on how biological
+features can impact the learning strategies of ANNs.
+
+
+ comment: 69 pages, 6 main and 11 supplementary figures, 2 main and 3 + supplementary tables +
+
+
+
+
+ + ♻ ☆ The Cost of Arbitrariness for Individuals: Examining the Legal and + Technical Challenges of Model Multiplicity + + +
+ Model multiplicity, the phenomenon where multiple models achieve similar +performance despite different underlying learned functions, introduces +arbitrariness in model selection. While this arbitrariness may seem +inconsequential in expectation, its impact on individuals can be severe. This +paper explores various individual concerns stemming from multiplicity, +including the effects of arbitrariness beyond final predictions, disparate +arbitrariness for individuals belonging to protected groups, and the challenges +associated with the arbitrariness of a single algorithmic system creating a +monopoly across various contexts. It provides both an empirical examination of +these concerns and a comprehensive analysis from the legal standpoint, +addressing how these issues are perceived in the anti-discrimination law in +Canada. We conclude the discussion with technical challenges in the current +landscape of model multiplicity to meet legal requirements and the legal gap +between current law and the implications of arbitrariness in model selection, +highlighting relevant future research directions for both disciplines. + +
+
+ comment: Current version of the paper contains errors in the attribution of + previous work. We are working on creating a new version, which can take a + while and thus are withdrawing this version in the meantime +
+
+
+
+
+ + ♻ ☆ Where We Have Arrived in Proving the Emergence of Sparse Symbolic + Concepts in AI Models + + +
+ This study aims to prove the emergence of symbolic concepts (or more +precisely, sparse primitive inference patterns) in well-trained deep neural +networks (DNNs). Specifically, we prove the following three conditions for the +emergence. (i) The high-order derivatives of the network output with respect to +the input variables are all zero. (ii) The DNN can be used on occluded samples +and when the input sample is less occluded, the DNN will yield higher +confidence. (iii) The confidence of the DNN does not significantly degrade on +occluded samples. These conditions are quite common, and we prove that under +these conditions, the DNN will only encode a relatively small number of sparse +interactions between input variables. Moreover, we can consider such +interactions as symbolic primitive inference patterns encoded by a DNN, because +we show that inference scores of the DNN on an exponentially large number of +randomly masked samples can always be well mimicked by numerical effects of +just a few interactions. + +
+
+
+
+
+ + ♻ ☆ Explaining Generalization Power of a DNN Using Interactive Concepts + + +
+ This paper explains the generalization power of a deep neural network (DNN)
+from the perspective of interactions. Although there is no universally accepted
+definition of the concepts encoded by a DNN, the sparsity of interactions in a
+DNN has been proved, i.e., the output score of a DNN can be well explained by a
+small number of interactions between input variables. In this way, to some
+extent, we can consider such interactions as interactive concepts encoded by
+the DNN. Therefore, in this paper, we derive an analytic explanation of the
+inconsistency of concepts of different complexities. This may shed new light on
+using the generalization power of concepts to explain the generalization power
+of the entire DNN. Besides, we discover that a DNN with stronger generalization
+power usually learns simple concepts more quickly and encodes fewer complex
+concepts. We also discover the detouring dynamics of learning complex concepts,
+which explains both the high learning difficulty and the low generalization
+power of complex concepts. The code will be released when the paper is
+accepted.
+
+
+
+
+
+ + ♻ ☆ Equivariant Neural Simulators for Stochastic Spatiotemporal Dynamics NeurIPS 2023 + + +
+ Neural networks are emerging as a tool for scalable data-driven simulation of +high-dimensional dynamical systems, especially in settings where numerical +methods are infeasible or computationally expensive. Notably, it has been shown +that incorporating domain symmetries in deterministic neural simulators can +substantially improve their accuracy, sample efficiency, and parameter +efficiency. However, to incorporate symmetries in probabilistic neural +simulators that can simulate stochastic phenomena, we need a model that +produces equivariant distributions over trajectories, rather than equivariant +function approximations. In this paper, we propose Equivariant Probabilistic +Neural Simulation (EPNS), a framework for autoregressive probabilistic modeling +of equivariant distributions over system evolutions. We use EPNS to design +models for a stochastic n-body system and stochastic cellular dynamics. Our +results show that EPNS considerably outperforms existing neural network-based +methods for probabilistic simulation. More specifically, we demonstrate that +incorporating equivariance in EPNS improves simulation quality, data +efficiency, rollout stability, and uncertainty quantification. We conclude that +EPNS is a promising method for efficient and effective data-driven +probabilistic simulation in a diverse range of domains. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Generalizing Graph Transformers Across Diverse Graphs and Tasks via + Pre-Training on Industrial-Scale Data + + +
+ Graph pre-training has so far concentrated on graph-level tasks over small
+graphs (e.g., molecular graphs) or on learning node representations on a fixed
+graph. Extending graph pre-trained models to web-scale graphs with billions of
+nodes in industrial scenarios, while avoiding negative transfer across graphs
+or tasks, remains a challenge. We aim to develop a general graph pre-trained
+model with inductive ability that can make predictions for unseen new nodes and
+even new graphs. In this work, we introduce a scalable transformer-based graph
+pre-training framework called PGT (Pre-trained Graph Transformer).
+Specifically, we design a flexible and scalable graph transformer as the
+backbone network. Meanwhile, based on the masked autoencoder architecture, we
+design two pre-training tasks: one for reconstructing node features and the
+other for reconstructing local structures. Unlike the original autoencoder
+architecture where the pre-trained decoder is discarded, we propose a novel
+strategy that utilizes the decoder for feature augmentation. We have deployed
+our framework on Tencent's online game data. Extensive experiments have
+demonstrated that our framework can perform pre-training on real-world
+web-scale graphs with over 540 million nodes and 12 billion edges and
+generalizes effectively to unseen new graphs with different downstream tasks.
+We further conduct experiments on the publicly available ogbn-papers100M
+dataset, which consists of 111 million nodes and 1.6 billion edges. Our
+framework achieves state-of-the-art performance on both industrial datasets and
+public datasets, while also enjoying scalability and efficiency.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks for Parkinsons Disease Detection ICASSP 2025 + + +
+ Despite the promising performance of state-of-the-art approaches for
+Parkinson's Disease (PD) detection, these approaches often analyze individual
+speech segments in isolation, which can lead to suboptimal results. Dysarthric
+cues that characterize speech impairments from PD patients are expected to be
+related across segments from different speakers. Isolated segment analysis
+fails to exploit these inter-segment relationships. Additionally, not all
+speech segments from PD patients exhibit clear dysarthric symptoms, introducing
+label noise that can negatively affect the performance and generalizability of
+current approaches. To address these challenges, we propose a novel PD
+detection framework utilizing Graph Convolutional Networks (GCNs). By
+representing speech segments as nodes and capturing the similarity between
+segments through edges, our GCN model facilitates the aggregation of dysarthric
+cues across the graph, effectively exploiting segment relationships and
+mitigating the impact of label noise. Experimental results demonstrate the
+advantages of the proposed GCN model for PD detection and provide insights into
+its underlying mechanisms.
+
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ Efficient Estimation of the Central Mean Subspace via Smoothed Gradient + Outer Products + + +
+ We consider the problem of sufficient dimension reduction (SDR) for +multi-index models. The estimators of the central mean subspace in prior works +either have slow (non-parametric) convergence rates, or rely on stringent +distributional conditions (e.g., the covariate distribution $P_{\mathbf{X}}$ +being elliptical symmetric). In this paper, we show that a fast parametric +convergence rate of form $C_d \cdot n^{-1/2}$ is achievable via estimating the +\emph{expected smoothed gradient outer product}, for a general class of +distribution $P_{\mathbf{X}}$ admitting Gaussian or heavier distributions. When +the link function is a polynomial with a degree of at most $r$ and +$P_{\mathbf{X}}$ is the standard Gaussian, we show that the prefactor depends +on the ambient dimension $d$ as $C_d \propto d^r$. + +
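+ A bare-bones version of this estimator family (illustrative only; the
+paper's estimator, bandwidth choices, and analysis are considerably more
+careful) smooths gradients by kernel-weighted local linear regression,
+averages their outer products, and takes the top eigenvectors.
+
+    import numpy as np
+
+    def smoothed_gradient_outer_product(X, y, bandwidth=0.5):
+        # Local slope at each point = smoothed gradient estimate.
+        n, d = X.shape
+        M = np.zeros((d, d))
+        for i in range(n):
+            diff = X - X[i]
+            w = np.exp(-np.sum(diff ** 2, axis=1) / (2 * bandwidth ** 2))
+            Z = np.hstack([np.ones((n, 1)), diff])          # local linear design
+            WZ = Z * w[:, None]
+            beta = np.linalg.solve(WZ.T @ Z + 1e-6 * np.eye(d + 1), WZ.T @ y)
+            g = beta[1:]                                    # gradient at X[i]
+            M += np.outer(g, g) / n
+        return M
+
+    def central_mean_subspace(X, y, r, bandwidth=0.5):
+        # Top-r eigenvectors of the averaged outer product span the estimate.
+        vals, vecs = np.linalg.eigh(smoothed_gradient_outer_product(X, y, bandwidth))
+        return vecs[:, np.argsort(vals)[::-1][:r]]
+
+    # toy single-index model: y depends on X only through one direction b
+    rng = np.random.default_rng(0)
+    X = rng.normal(size=(300, 5))
+    b = np.array([1.0, 2.0, 0.0, 0.0, 0.0]); b /= np.linalg.norm(b)
+    y = (X @ b) ** 2 + 0.1 * rng.normal(size=300)
+    print(central_mean_subspace(X, y, r=1).ravel())          # approx. +/- b
+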
+
+
+
+
+ + ♻ ☆ Sub-SA: Strengthen In-context Learning via Submodular Selective + Annotation ECAI 2024 + + +
+ In-context learning (ICL) leverages in-context examples as prompts for the
+predictions of Large Language Models (LLMs). These prompts play a crucial role
+in achieving strong performance. However, the selection of suitable prompts
+from a large pool of labeled examples often entails significant annotation
+costs. To address this challenge, we propose Sub-SA (Submodular Selective
+Annotation), a submodularity-based selective annotation method. The aim of
+Sub-SA is to reduce annotation costs while improving the quality of in-context
+examples and minimizing the time consumption of the selection process. In
+Sub-SA, we design a submodular function that facilitates effective subset
+selection for annotation and satisfies monotonicity and submodularity from a
+theoretical perspective. Specifically, we propose RPR (Reward and Penalty
+Regularization) to better balance the diversity and representativeness of the
+unlabeled dataset via a reward term and a penalty term, respectively.
+Consequently, the selection of annotations can be effectively addressed with a
+simple yet effective greedy search algorithm based on the submodular function.
+Finally, we apply similarity-based prompt retrieval to obtain the examples for
+ICL.
+
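+ The greedy selection step can be sketched with a facility-location reward
+and a redundancy penalty over example embeddings. This is an illustrative
+stand-in; the exact RPR objective and its weighting are defined in the paper.
+
+    import numpy as np
+
+    def sub_sa_greedy(E, budget, lam=0.5):
+        # E: (n, d) embeddings of the unlabeled pool; returns selected indices.
+        S = E / (np.linalg.norm(E, axis=1, keepdims=True) + 1e-9)
+        sim = S @ S.T                          # pairwise cosine similarity
+        selected = []
+        covered = np.zeros(len(E))             # best similarity to the chosen set
+        for _ in range(budget):
+            best, best_gain = None, -np.inf
+            for j in range(len(E)):
+                if j in selected:
+                    continue
+                reward = np.maximum(covered, sim[j]).sum() - covered.sum()
+                penalty = sim[j, selected].sum() if selected else 0.0
+                gain = reward - lam * penalty  # representativeness minus redundancy
+                if gain > best_gain:
+                    best, best_gain = j, gain
+            selected.append(best)
+            covered = np.maximum(covered, sim[best])
+        return selected
+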
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ♻ ☆ FRAC-Q-Learning: A Reinforcement Learning with Boredom Avoidance + Processes for Social Robots + + +
+ Reinforcement learning algorithms have often been applied to social robots.
+However, most reinforcement learning algorithms were not optimized for use with
+social robots, and consequently they may bore users. We propose a new
+reinforcement learning method specialized for social robots, FRAC-Q-learning,
+that can avoid user boredom. The proposed algorithm consists of a forgetting
+process in addition to randomizing and categorizing processes. This study
+evaluated the interest and boredom-hardness scores of FRAC-Q-learning in
+comparison with traditional Q-learning. FRAC-Q-learning showed a significantly
+higher trend in interest scores and was significantly harder to bore users with
+than traditional Q-learning. Therefore, FRAC-Q-learning can contribute to
+developing social robots that will not bore users. The proposed algorithm also
+has potential applications in Web-based communication and educational systems.
+This paper presents the entire process, detailed implementation and a detailed
+evaluation method of FRAC-Q-learning for the first time.
+
+
+
+
+
+ + ♻ ☆ Explainable Machine Learning for ICU Readmission Prediction + + +
+ The intensive care unit (ICU) comprises a complex hospital environment,
+where decisions made by clinicians have a high level of risk for the patients'
+lives. A comprehensive care pathway must then be followed to reduce
+complications. Uncertain, competing and unplanned aspects within this
+environment increase the difficulty in uniformly implementing the care pathway.
+Readmission contributes to this pathway's difficulty, occurring when patients
+are admitted again to the ICU in a short timeframe, resulting in high mortality
+rates and high resource utilisation. Several works have tried to predict
+readmission through patients' medical information. Although they have some
+level of success while predicting readmission, those works do not properly
+assess, characterise and understand readmission prediction. This work proposes
+a standardised and explainable machine learning pipeline to model patient
+readmission on a multicentric database (i.e., the eICU cohort with 166,355
+patients, 200,859 admissions and 6,021 readmissions) while validating it on
+monocentric (i.e., the MIMIC IV cohort with 382,278 patients, 523,740
+admissions and 5,984 readmissions) and multicentric settings. Our machine
+learning pipeline achieved predictive performance in terms of the area under
+the receiver operating characteristic curve (AUC) up to 0.7 with a Random
+Forest classification model, yielding an overall good calibration and
+consistency on validation sets. From explanations provided by the constructed
+models, we could also derive a set of insightful conclusions, primarily on
+variables related to vital signs and blood tests (e.g., albumin, blood urea
+nitrogen and hemoglobin levels), demographics (e.g., age, and admission height
+and weight), and ICU-associated variables (e.g., unit type). These insights
+provide an invaluable source of information during clinicians' decision-making
+while discharging ICU patients.
+
+
+
+
+
+ + ♻ ☆ MoDeGPT: Modular Decomposition for Large Language Model Compression + + +
+ Large Language Models (LLMs) have reshaped the landscape of artificial +intelligence by demonstrating exceptional performance across various tasks. +However, substantial computational requirements make their deployment +challenging on devices with limited resources. Recently, compression methods +using low-rank matrix techniques have shown promise, yet these often lead to +degraded accuracy or introduce significant overhead in parameters and inference +latency. This paper introduces \textbf{Mo}dular \textbf{De}composition +(MoDeGPT), a novel structured compression framework that does not need recovery +fine-tuning while resolving the above drawbacks. MoDeGPT partitions the +Transformer block into modules comprised of matrix pairs and reduces the hidden +dimensions via reconstructing the module-level outputs. MoDeGPT is developed +based on a theoretical framework that utilizes three well-established matrix +decomposition algorithms -- Nystr\"om approximation, CR decomposition, and SVD +-- and applies them to our redefined transformer modules. Our comprehensive +experiments show MoDeGPT, without backward propagation, matches or surpasses +previous structured compression methods that rely on gradient information, and +saves 98% of compute costs on compressing a 13B model. On \textsc{Llama}-2/3 +and OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30% +compression rates. Moreover, the compression can be done on a single GPU within +a few hours and increases the inference throughput by up to 46%. + +
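+ The flavor of the matrix-pair idea can be seen in a toy, SVD-only sketch:
+two consecutive linear maps are replaced by a rank-k factorization of their
+product, shrinking the shared hidden dimension. This is not MoDeGPT's
+algorithm, which applies Nystr\"om, CR, or SVD per module type and
+reconstructs module-level outputs, but it shows the kind of reduction involved.
+
+    import numpy as np
+
+    def compress_pair(W1, W2, k):
+        # y = W2 @ (W1 @ x); approximate the composition with rank k.
+        U, s, Vt = np.linalg.svd(W2 @ W1, full_matrices=False)
+        A = U[:, :k] * s[:k]        # new "down" matrix, shape (out, k)
+        B = Vt[:k]                  # new "up" matrix, shape (k, in)
+        return B, A
+
+    rng = np.random.default_rng(0)
+    W1 = rng.normal(size=(512, 128))        # in 128 -> hidden 512
+    W2 = rng.normal(size=(128, 512))        # hidden 512 -> out 128
+    B, A = compress_pair(W1, W2, k=64)      # hidden dimension 512 -> 64
+    x = rng.normal(size=128)
+    ref = W2 @ (W1 @ x)
+    err = np.linalg.norm(ref - A @ (B @ x)) / np.linalg.norm(ref)
+    print(f"relative reconstruction error: {err:.3f}")
+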
+
+ comment: 31 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey on Deep Multimodal Learning with Missing Modality + + +
+ During multimodal model training and reasoning, data samples may miss certain +modalities and lead to compromised model performance due to sensor limitations, +cost constraints, privacy concerns, data loss, and temporal and spatial +factors. This survey provides an overview of recent progress in Multimodal +Learning with Missing Modality (MLMM), focusing on deep learning techniques. It +is the first comprehensive survey that covers the historical background and the +distinction between MLMM and standard multimodal learning setups, followed by a +detailed analysis of current MLMM methods, applications, and datasets, +concluding with a discussion about challenges and potential future directions +in the field. + +
+
+ comment: Work in progress; open to discussion; planning to submit to ACM CSUR + in September +
+
+
+
+
+ + ♻ ☆ Distribution-Free Fair Federated Learning with Small Samples + + +
+ As federated learning gains increasing importance in real-world applications +due to its capacity for decentralized data training, addressing fairness +concerns across demographic groups becomes critically important. However, most +existing machine learning algorithms for ensuring fairness are designed for +centralized data environments and generally require large-sample and +distributional assumptions, underscoring the urgent need for fairness +techniques adapted for decentralized and heterogeneous systems with +finite-sample and distribution-free guarantees. To address this issue, this +paper introduces FedFaiREE, a post-processing algorithm developed specifically +for distribution-free fair learning in decentralized settings with small +samples. Our approach accounts for unique challenges in decentralized +environments, such as client heterogeneity, communication costs, and small +sample sizes. We provide rigorous theoretical guarantees for both fairness and +accuracy, and our experimental results further provide robust empirical +validation for our proposed method. + +
+
+
+
+
+ + ♻ ☆ Multi-Source Music Generation with Latent Diffusion ICASSP 2025 + + +
+ Most music generation models directly generate a single music mixture. To +allow for more flexible and controllable generation, the Multi-Source Diffusion +Model (MSDM) has been proposed to model music as a mixture of multiple +instrumental sources (e.g. piano, drums, bass, and guitar). Its goal is to use +one single diffusion model to generate mutually-coherent music sources, that +are then mixed to form the music. Despite its capabilities, MSDM is unable to +generate music with rich melodies and often generates empty sounds. Its +waveform diffusion approach also introduces significant Gaussian noise +artifacts that compromise audio quality. In response, we introduce a +Multi-Source Latent Diffusion Model (MSLDM) that employs Variational +Autoencoders (VAEs) to encode each instrumental source into a distinct latent +representation. By training a VAE on all music sources, we efficiently capture +each source's unique characteristics in a "source latent." The source latents +are concatenated and our diffusion model learns this joint latent space. This +approach significantly enhances the total and partial generation of music by +leveraging the VAE's latent compression and noise-robustness. The compressed +source latent also facilitates more efficient generation. Subjective listening +tests and Frechet Audio Distance (FAD) scores confirm that our model +outperforms MSDM, showcasing its practical and enhanced applicability in music +generation systems. We also emphasize that modeling sources is more effective +than direct music mixture modeling. Codes and models are available at +https://github.com/XZWY/MSLDM. Demos are available at +https://xzwy.github.io/MSLDMDemo/. + +
+
+ comment: ICASSP 2025 in Submission +
+
+
+
+
+ + ♻ ☆ A Joint Learning Model with Variational Interaction for Multilingual + Program Translation + + +
+ Programs implemented in various programming languages form the foundation of +software applications. To alleviate the burden of program migration and +facilitate the development of software systems, automated program translation +across languages has garnered significant attention. Previous approaches +primarily focus on pairwise translation paradigms, learning translation between +pairs of languages using bilingual parallel data. However, parallel data is +difficult to collect for some language pairs, and the distribution of program +semantics across languages can shift, posing challenges for pairwise program +translation. In this paper, we argue that jointly learning a unified model to +translate code across multiple programming languages is superior to separately +learning from bilingual parallel data. We propose Variational Interaction for +Multilingual Program Translation~(VIM-PT), a disentanglement-based generative +approach that jointly trains a unified model for multilingual program +translation across multiple languages. VIM-PT disentangles code into +language-shared and language-specific features, using variational inference and +interaction information with a novel lower bound, then achieves program +translation through conditional generation. VIM-PT demonstrates four +advantages: 1) captures language-shared information more accurately from +various implementations and improves the quality of multilingual program +translation, 2) mines and leverages the capability of non-parallel data, 3) +addresses the distribution shift of program semantics across languages, 4) and +serves as a unified model, reducing deployment complexity. + +
+
+ comment: Accepted by the 39th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2024) +
+
+
+
+
+ + ♻ ☆ TimeLDM: Latent Diffusion Model for Unconditional Time Series Generation + + +
+ Time series generation is a crucial research topic in the area of decision-making systems, which can be particularly important in domains like autonomous driving, healthcare, and, notably, robotics. Recent approaches focus on learning in the data space to model time series information. However, the data space often contains limited observations and noisy features. In this paper, we propose TimeLDM, a novel latent diffusion model for high-quality time series generation. TimeLDM is composed of a variational autoencoder that encodes time series into an informative and smoothed latent representation and a latent diffusion model operating in the latent space to generate latent information. We evaluate the ability of our method to generate synthetic time series with simulated and real-world datasets and benchmark the performance against existing state-of-the-art methods. Qualitatively and quantitatively, we find that the proposed TimeLDM consistently delivers high-quality generated time series. For example, TimeLDM achieves new state-of-the-art results on the simulated benchmarks and an average improvement of 55% in Discriminative score across all benchmarks. Further studies demonstrate that our method yields more robust outcomes across various lengths of time series data generation. In particular, for the Context-FID score and the Discriminative score, TimeLDM realizes significant improvements of 80% and 50%, respectively. The code will be released after publication.
+
+
+
+
+ + ♻ ☆ Patch-Level Training for Large Language Models + + +
+ As Large Language Models (LLMs) achieve remarkable progress in language +understanding and generation, their training efficiency has become a critical +concern. Traditionally, LLMs are trained to predict the next token in a +sequence. Despite the success of token-level training, it suffers from +considerable computational costs due to the need to process an extensive number +of tokens. To mitigate this issue, this paper introduces patch-level training +for LLMs, which reduces the sequence length by compressing multiple tokens into +a single patch. During patch-level training, we feed the language model shorter +sequences of patches and train it to predict the next patch, thereby processing +the majority of the training data at a significantly reduced computational +cost. Following this, the model continues token-level training on the remaining +training data to align with the inference mode. Experiments on a diverse range +of models (370M-2.7B parameters) demonstrate that patch-level training can +reduce overall computational costs to 0.5$\times$, without compromising the +model performance compared to token-level training. Source code: +\url{https://github.com/shaochenze/PatchTrain}. + +
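+ A minimal sketch of the patch-level idea, with assumptions: patches are formed here by averaging K consecutive token embeddings, and a GRU with a regression loss stands in for the transformer LM and its next-patch objective, which differ from the paper's actual setup.
+
+ import torch
+ import torch.nn as nn
+
+ K, vocab, d = 4, 1000, 64
+ embed = nn.Embedding(vocab, d)
+ backbone = nn.GRU(d, d, batch_first=True)     # stand-in for the transformer LM
+ head = nn.Linear(d, d)                        # predicts the next patch embedding
+
+ tokens = torch.randint(0, vocab, (2, 64))     # (batch, seq_len), seq_len divisible by K
+ x = embed(tokens)                             # (2, 64, d)
+ patches = x.view(2, -1, K, d).mean(dim=2)     # (2, 16, d): the sequence is K times shorter
+
+ hidden, _ = backbone(patches[:, :-1])         # causal pass over patches 0..T-1
+ loss = ((head(hidden) - patches[:, 1:]) ** 2).mean()  # predict patch t+1 from patches <= t
+ loss.backward()
+ print("patch-level loss:", float(loss))
+ # Training would then switch back to ordinary next-token prediction on the
+ # remaining data so the model matches the token-level inference mode.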
+
+
+
+
+ + ♻ ☆ Meta-Learn Unimodal Signals with Weak Supervision for Multimodal + Sentiment Analysis + + +
+ Multimodal sentiment analysis aims to effectively integrate information from various sources to infer sentiment, but in many cases there are no annotations for the individual modalities. Therefore, most works rely on multimodal labels for training. However, this creates a noisy label problem for the learning of unimodal signals, as multimodal annotations are not always ideal substitutes for the unimodal ones and fail to achieve finer optimization for individual modalities. In this paper, we explore the learning of unimodal labels under weak supervision from the annotated multimodal labels. Specifically, we propose a novel meta uni-label generation (MUG) framework to address the above problem, which leverages the available multimodal labels to learn the corresponding unimodal labels via a meta uni-label correction network (MUCN). We first design a contrastive-based projection module to bridge the gap between unimodal and multimodal representations, so as to use multimodal annotations to guide the learning of MUCN. Afterwards, we propose unimodal and multimodal denoising tasks to train MUCN with explicit supervision via a bi-level optimization strategy. We then jointly train unimodal and multimodal learning tasks to extract discriminative unimodal features for multimodal inference. Experimental results suggest that MUG outperforms competitive baselines and can learn accurate unimodal labels.
+
+
+
+
+ + ♻ ☆ Hardware-Assisted Virtualization of Neural Processing Units for Cloud + Platforms MICRO'24 + + +
+ Cloud platforms today have been deploying hardware accelerators like neural processing units (NPUs) for powering machine learning (ML) inference services. To maximize resource utilization while ensuring reasonable quality of service, a natural approach is to virtualize NPUs for efficient resource sharing among multi-tenant ML services. However, virtualizing NPUs for modern cloud platforms is not easy. This is not only due to the lack of system abstraction support for NPU hardware, but also due to the lack of architectural and ISA support for enabling fine-grained dynamic operator scheduling for virtualized NPUs. We present Neu10, a holistic NPU virtualization framework. We investigate virtualization techniques for NPUs across the entire software and hardware stack. Neu10 consists of (1) a flexible NPU abstraction called vNPU, which enables fine-grained virtualization of the heterogeneous compute units in a physical NPU (pNPU); (2) a vNPU resource allocator that enables a pay-as-you-go computing model and flexible vNPU-to-pNPU mappings for improved resource utilization and cost-effectiveness; and (3) an ISA extension of modern NPU architectures for facilitating fine-grained tensor operator scheduling across multiple vNPUs. We implement Neu10 based on a production-level NPU simulator. Our experiments show that Neu10 improves the throughput of ML inference services by up to 1.4$\times$ and reduces the tail latency by up to 4.6$\times$, while improving NPU utilization by 1.2$\times$ on average, compared to state-of-the-art NPU sharing approaches.
+
+ comment: Accepted to MICRO'24 +
+
+
+
+
+ + ♻ ☆ An Efficient Approach to Regression Problems with Tensor Neural Networks + + +
+ This paper introduces a tensor neural network (TNN) to address nonparametric +regression problems, leveraging its distinct sub-network structure to +effectively facilitate variable separation and enhance the approximation of +complex, high-dimensional functions. The TNN demonstrates superior performance +compared to conventional Feed-Forward Networks (FFN) and Radial Basis Function +Networks (RBN) in terms of both approximation accuracy and generalization +capacity, even with a comparable number of parameters. A significant innovation +in our approach is the integration of statistical regression and numerical +integration within the TNN framework. This allows for efficient computation of +high-dimensional integrals associated with the regression function and provides +detailed insights into the underlying data structure. Furthermore, we employ +gradient and Laplacian analysis on the regression outputs to identify key +dimensions influencing the predictions, thereby guiding the design of +subsequent experiments. These advancements make TNN a powerful tool for +applications requiring precise high-dimensional data analysis and predictive +modeling. + +
+
+
+
+
+ + ♻ ☆ Wildfire Risk Prediction: A Review + + +
+ Wildfires have significant impacts on global vegetation, wildlife, and humans. They destroy plant communities and wildlife habitats and contribute to increased emissions of carbon dioxide, nitrogen oxides, methane, and other pollutants. The prediction of wildfires relies on various independent variables combined with regression or machine learning methods. In this technical review, we describe the options for independent variables, data processing techniques, models, methods for estimating the collinearity and importance of independent variables, and model performance evaluation metrics. First, we divide the independent variables into four aspects: climate and meteorological conditions, socioeconomic factors, terrain and hydrological features, and historical wildfire records. Second, preprocessing methods are described for data of different magnitudes, spatio-temporal resolutions, and formats. Third, methods for evaluating the collinearity and importance of independent variables are considered. Fourth, we discuss the application of statistical models, traditional machine learning models, and deep learning models in wildfire risk prediction. Compared with other reviews, this manuscript particularly discusses evaluation metrics and recent advances in deep learning methods. Lastly, addressing the limitations of current research, this paper emphasizes the need for more effective deep learning time series forecasting algorithms, the utilization of three-dimensional data including ground and trunk fuel, the extraction of more accurate historical fire point data, and improved model evaluation metrics.
+
+
+
+
+ + ♻ ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. Precise manual segmentation of facial wrinkles is challenging and time-consuming, with inherent subjectivity leading to inconsistent results among graders. To address this issue, we propose two solutions. First, we build and release the first public facial wrinkle dataset, 'FFHQ-Wrinkle', an extension of the NVIDIA FFHQ dataset. It includes 1,000 images with human labels and 50,000 images with automatically generated weak labels. This dataset could serve as a foundation for the research community to develop advanced wrinkle detection algorithms. Second, we introduce a simple training strategy utilizing texture maps, applicable to various segmentation models, to detect wrinkles across the face. Our two-stage training strategy first pretrains models on a large dataset with weak labels (N=50k), i.e., masked texture maps generated through computer vision techniques without human intervention. We then finetune the models using human-labeled data (N=1k), which consists of manually labeled wrinkle masks. During finetuning, the network takes as input a four-channel combination of the RGB image and its masked texture map. We effectively combine labels from multiple annotators to minimize subjectivity in manual labeling. Our strategies demonstrate improved performance in facial wrinkle segmentation, both quantitatively and visually, compared to existing pretraining methods. The dataset is available at https://github.com/labhai/ffhq-wrinkle-dataset.
+
+
+
+
+ + ♻ ☆ Adapting Machine Learning Diagnostic Models to New Populations Using a + Small Amount of Data: Results from Clinical Neuroscience + + +
+ Machine learning (ML) has shown great promise for revolutionizing a number of areas, including healthcare. However, it is also facing a reproducibility crisis, especially in medicine. ML models that are carefully constructed from and evaluated on a training set might not generalize well on data from different patient populations or acquisition instrument settings and protocols. We tackle this problem in the context of neuroimaging of Alzheimer's disease (AD), schizophrenia (SZ) and brain aging. We develop a weighted empirical risk minimization approach that optimally combines data from a source group, in which subjects are stratified by attributes such as sex, age group, race, and clinical cohort, to make predictions on a target group (e.g., a different sex or age group) using only a small fraction (10%) of data from the target group. We apply this method to multi-source data of 15,363 individuals from 20 neuroimaging studies to build ML models for diagnosis of AD and SZ, and estimation of brain age. We found that this approach achieves substantially better accuracy than existing domain adaptation techniques: it obtains an area under the curve greater than 0.95 for AD classification, an area under the curve greater than 0.7 for SZ classification, and a mean absolute error of less than 5 years for brain age prediction on all target groups, achieving robustness to variations of scanners, protocols, and demographic or clinical characteristics. In some cases, it is even better than training on all data from the target group, because it leverages the diversity and size of a larger training set. We also demonstrate the utility of our models for prognostic tasks such as predicting disease progression in individuals with mild cognitive impairment. Critically, our brain age prediction models lead to new clinical insights regarding correlations with neurophysiological tests.
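+ A minimal sketch of the weighted-ERM idea under stated assumptions: synthetic features, a logistic-regression stand-in for the diagnostic model, and a hypothetical up-weighting factor for the small target-group fraction included in training.
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ rng = np.random.default_rng(0)
+ X_src, y_src = rng.normal(size=(1000, 20)), rng.integers(0, 2, 1000)          # source group
+ X_tgt, y_tgt = rng.normal(0.5, 1.0, size=(100, 20)), rng.integers(0, 2, 100)  # shifted target group
+
+ n_tgt_train = 10                                   # the "small fraction (10%)" of target data
+ X_train = np.vstack([X_src, X_tgt[:n_tgt_train]])
+ y_train = np.concatenate([y_src, y_tgt[:n_tgt_train]])
+
+ target_weight = 5.0                                # hypothetical weight on target samples
+ w = np.concatenate([np.ones(len(X_src)), target_weight * np.ones(n_tgt_train)])
+
+ clf = LogisticRegression(max_iter=1000).fit(X_train, y_train, sample_weight=w)
+ print("held-out target accuracy:", clf.score(X_tgt[n_tgt_train:], y_tgt[n_tgt_train:]))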
+
+
+
+
+ + ♻ ☆ Integrating Chemical Language and Molecular Graph in Multimodal Fused + Deep Learning for Drug Property Prediction + + +
+ Accurately predicting molecular properties is a challenging but essential task in drug discovery. Recently, many mono-modal deep learning methods have been successfully applied to molecular property prediction. However, the inherent limitation of mono-modal learning arises from relying solely on one modality of molecular representation, which restricts a comprehensive understanding of drug molecules and hampers resilience against data noise. To overcome these limitations, we construct multimodal deep learning models to cover different molecular representations. We convert drug molecules into three molecular representations: SMILES-encoded vectors, ECFP fingerprints, and molecular graphs. To process the modal information, a Transformer encoder, bidirectional gated recurrent units (BiGRU), and a graph convolutional network (GCN) are utilized for feature learning, respectively, which enhances the model's capability to acquire complementary and naturally occurring bioinformatics information. We evaluated our triple-modal model on six molecule datasets. Different from bi-modal learning models, we adopt five fusion methods to capture the specific features and better leverage the contribution of each modality. Compared with mono-modal models, our multimodal fused deep learning (MMFDL) models outperform single models in accuracy, reliability, and resistance against noise. Moreover, we demonstrate their generalization ability in the prediction of binding constants for protein-ligand complexes in the refined set of PDBbind. The advantage of the multimodal model lies in its ability to process diverse sources of data using proper models and suitable fusion methods, which enhances the noise resistance of the model while capturing data diversity.
+
+
+
+
+ + ♻ ☆ Explaining Large Language Models Decisions with Shapley Values + + +
+ The emergence of large language models (LLMs) has opened up exciting +possibilities for simulating human behavior and cognitive processes, with +potential applications in various domains, including marketing research and +consumer behavior analysis. However, the validity of utilizing LLMs as +stand-ins for human subjects remains uncertain due to glaring divergences that +suggest fundamentally different underlying processes at play and the +sensitivity of LLM responses to prompt variations. This paper presents a novel +approach based on Shapley values from cooperative game theory to interpret LLM +behavior and quantify the relative contribution of each prompt component to the +model's output. Through two applications - a discrete choice experiment and an +investigation of cognitive biases - we demonstrate how the Shapley value method +can uncover what we term "token noise" effects, a phenomenon where LLM +decisions are disproportionately influenced by tokens providing minimal +informative content. This phenomenon raises concerns about the robustness and +generalizability of insights obtained from LLMs in the context of human +behavior simulation. Our model-agnostic approach extends its utility to +proprietary LLMs, providing a valuable tool for practitioners and researchers +to strategically optimize prompts and mitigate apparent cognitive biases. Our +findings underscore the need for a more nuanced understanding of the factors +driving LLM responses before relying on them as substitutes for human subjects +in survey settings. We emphasize the importance of researchers reporting +results conditioned on specific prompt templates and exercising caution when +drawing parallels between human behavior and LLMs. + +
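+ A minimal sketch of the Shapley-value computation over prompt components; the component names and the additive value function below are hypothetical stand-ins for querying an LLM with prompts built from each coalition.
+
+ from itertools import combinations
+ from math import factorial
+
+ components = ["persona", "task_description", "option_wording", "filler_tokens"]
+
+ def value(coalition):
+     # Hypothetical value function; in practice this would be, e.g., the LLM's
+     # probability of choosing option A given a prompt built from the coalition.
+     weights = {"persona": 0.05, "task_description": 0.30,
+                "option_wording": 0.10, "filler_tokens": 0.20}  # toy "token noise"
+     return sum(weights[c] for c in coalition)
+
+ def shapley(target):
+     n, others = len(components), [c for c in components if c != target]
+     total = 0.0
+     for r in range(len(others) + 1):
+         for subset in combinations(others, r):
+             weight = factorial(len(subset)) * factorial(n - len(subset) - 1) / factorial(n)
+             total += weight * (value(subset + (target,)) - value(subset))
+     return total
+
+ for c in components:
+     print(f"{c:>18}: {shapley(c):.3f}")  # contribution of each prompt component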
+
+
+
+
+
+
+
+ + Multimedia 10 + +
+
+
+ + ☆ LLaQo: Towards a Query-Based Coach in Expressive Music Performance + Assessment + + +
+ Research in music understanding has extensively explored composition-level attributes such as key, genre, and instrumentation through advanced representations, leading to cross-modal applications using large language models. However, aspects of musical performance such as stylistic expression and technique remain underexplored, along with the potential of using large language models to enhance educational outcomes with customized feedback. To bridge this gap, we introduce LLaQo, a Large Language Query-based music coach that leverages audio language modeling to provide detailed and formative assessments of music performances. We also introduce instruction-tuned query-response datasets that cover a variety of performance dimensions from pitch accuracy to articulation, as well as contextual performance understanding (such as difficulty and performance techniques). Utilizing an AudioMAE encoder and a Vicuna-7b LLM backend, our model achieved state-of-the-art (SOTA) results in predicting teachers' performance ratings, as well as in identifying piece difficulty and playing techniques. Textual responses from LLaQo were moreover rated significantly higher than those of other baseline models in a user study using audio-text matching. Our proposed model can thus provide informative answers to open-ended questions related to musical performance from audio data.
+
+
+
+
+ + ☆ On the Computation of BD-Rate over a Set of Videos for Fair Assessment + of Performance of Learned Video Codecs ICASSP 2025 + + +
+ The Bjøntegaard Delta (BD) measure is widely employed to evaluate and quantify the variations in rate-distortion (RD) performance across different codecs. Many researchers report the average BD value over multiple videos within a dataset for different codecs. We claim that the current practice in the learned video compression community of computing the average BD value over a dataset based on the average RD curve of multiple videos can lead to misleading conclusions. We show, both by analysis of a simplistic case of linear RD curves and by experimental results with two recent learned video codecs, that averaging RD curves can allow a single video to disproportionately influence the average BD value, especially when the operating bitrate ranges of different codecs do not exactly match. Instead, we advocate calculating the BD measure on a per-video basis, as commonly done by the traditional video compression community, and then averaging the individual BD values over videos, to provide a fair comparison of learned video codecs. Our experimental results demonstrate that the comparison of two recent learned video codecs is affected by how we evaluate the average BD measure.
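+ A minimal sketch of the advocated protocol, assuming the standard Bjøntegaard computation (cubic fit of log-rate as a function of quality) and toy RD points: compute the BD value per video first, then average over videos.
+
+ import numpy as np
+
+ def bd_rate(r_anchor, q_anchor, r_test, q_test):
+     # Average bitrate difference (%) of the test codec over the anchor, using
+     # cubic fits of log-rate as a function of quality (e.g. PSNR).
+     la, lt = np.log(r_anchor), np.log(r_test)
+     pa, pt = np.polyfit(q_anchor, la, 3), np.polyfit(q_test, lt, 3)
+     lo, hi = max(min(q_anchor), min(q_test)), min(max(q_anchor), max(q_test))
+     ia = np.polyval(np.polyint(pa), [lo, hi])
+     it = np.polyval(np.polyint(pt), [lo, hi])
+     avg_diff = ((it[1] - it[0]) - (ia[1] - ia[0])) / (hi - lo)
+     return (np.exp(avg_diff) - 1) * 100
+
+ # Toy per-video RD points (bitrate in kbps, PSNR in dB) for two videos.
+ videos = {
+     "video_a": dict(r_anchor=[400, 800, 1600, 3200], q_anchor=[33, 36, 39, 42],
+                     r_test=[380, 700, 1500, 2900],   q_test=[33, 36, 39, 42]),
+     "video_b": dict(r_anchor=[1000, 2000, 4000, 8000], q_anchor=[30, 33, 36, 39],
+                     r_test=[900, 1900, 3600, 7800],    q_test=[30, 33, 36, 39]),
+ }
+
+ per_video = [bd_rate(**v) for v in videos.values()]
+ print("per-video BD-rates:", [round(b, 2) for b in per_video])
+ print("average of per-video BD-rates:", round(float(np.mean(per_video)), 2))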
+
+ comment: Submitted to IEEE ICASSP 2025 +
+
+
+
+
+ + ☆ Rhythmic Foley: A Framework For Seamless Audio-Visual Alignment In + Video-to-Audio Synthesis + + +
+ Our research introduces an innovative framework for video-to-audio synthesis, +which solves the problems of audio-video desynchronization and semantic loss in +the audio. By incorporating a semantic alignment adapter and a temporal +synchronization adapter, our method significantly improves semantic integrity +and the precision of beat point synchronization, particularly in fast-paced +action sequences. Utilizing a contrastive audio-visual pre-trained encoder, our +model is trained with video and high-quality audio data, improving the quality +of the generated audio. This dual-adapter approach empowers users with enhanced +control over audio semantics and beat effects, allowing the adjustment of the +controller to achieve better results. Extensive experiments substantiate the +effectiveness of our framework in achieving seamless audio-visual alignment. + +
+
+
+
+
+ + ☆ STA-V2A: Video-to-Audio Generation with Semantic and Temporal Alignment ICASSP2025 + + +
+ Visual and auditory perception are two crucial ways humans experience the +world. Text-to-video generation has made remarkable progress over the past +year, but the absence of harmonious audio in generated video limits its broader +applications. In this paper, we propose Semantic and Temporal Aligned +Video-to-Audio (STA-V2A), an approach that enhances audio generation from +videos by extracting both local temporal and global semantic video features and +combining these refined video features with text as cross-modal guidance. To +address the issue of information redundancy in videos, we propose an onset +prediction pretext task for local temporal feature extraction and an attentive +pooling module for global semantic feature extraction. To supplement the +insufficient semantic information in videos, we propose a Latent Diffusion +Model with Text-to-Audio priors initialization and cross-modal guidance. We +also introduce Audio-Audio Align, a new metric to assess audio-temporal +alignment. Subjective and objective metrics demonstrate that our method +surpasses existing Video-to-Audio models in generating audio with better +quality, semantic consistency, and temporal alignment. The ablation experiment +validated the effectiveness of each module. Audio samples are available at +https://y-ren16.github.io/STAV2A. + +
+
+ comment: Submitted to ICASSP2025 +
+
+
+
+
+ + ☆ Confidence Calibration for Audio Captioning Models + + +
+ Systems that automatically generate text captions for audio, images and video lack a confidence indicator of the relevance and correctness of the generated sequences. To address this, we build on existing methods of confidence measurement for text by introducing selective pooling of token probabilities, which aligns better with traditional correctness measures than conventional pooling does. Further, we propose directly measuring the similarity between input audio and text in a shared embedding space. To measure self-consistency, we adapt semantic entropy for audio captioning, and find that these two methods align even better than pooling-based metrics with the correctness measure that calculates acoustic similarity between captions. Finally, we explain why temperature scaling of confidences improves calibration.
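+ A minimal sketch, with assumptions: "selective pooling" is interpreted here as averaging only the least confident tokens of a generated caption, and temperature scaling is applied to the logits before pooling; neither detail is taken from the paper itself.
+
+ import numpy as np
+
+ def token_probs(logits, token_ids, temperature=1.0):
+     z = logits / temperature
+     p = np.exp(z - z.max(axis=-1, keepdims=True))
+     p /= p.sum(axis=-1, keepdims=True)
+     return p[np.arange(len(token_ids)), token_ids]  # probability of each generated token
+
+ def pooled_confidence(probs, mode="mean", k=3):
+     if mode == "mean":                    # conventional pooling
+         return float(np.mean(probs))
+     if mode == "selective":               # assumed variant: focus on the weakest tokens
+         return float(np.mean(np.sort(probs)[:k]))
+     raise ValueError(mode)
+
+ rng = np.random.default_rng(0)
+ logits = rng.normal(size=(12, 500))       # 12 generated tokens, 500-word vocabulary
+ token_ids = rng.integers(0, 500, 12)
+
+ for T in (1.0, 2.0):                      # temperature scaling flattens the distribution
+     p = token_probs(logits, token_ids, temperature=T)
+     print(f"T={T}: mean={pooled_confidence(p):.4f} "
+           f"selective={pooled_confidence(p, 'selective'):.4f}")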
+
+
+
+
+ + ♻ ☆ Diverse Neural Audio Embeddings -- Bringing Features back ! ICASSP 2025 + + +
+ With the advent of modern AI architectures, a shift has occurred towards end-to-end architectures. This pivot has led to neural architectures being trained without domain-specific biases/knowledge, optimized purely for the task at hand. In this paper, we instead learn audio embeddings via diverse, domain-specific feature representations. For the case of audio classification over hundreds of categories of sound, we learn robust separate embeddings for diverse audio properties such as pitch, timbre, and neural representation, along with also learning them via an end-to-end architecture. We observe that handcrafted embeddings, e.g., pitch- and timbre-based ones, are not able to beat a fully end-to-end representation on their own, yet adding them to the end-to-end embedding significantly improves performance. This work paves the way for bringing domain expertise together with end-to-end models to learn robust, diverse representations that surpass the performance of training end-to-end models alone.
+
+ comment: 6 pages, 1 figure, 2 tables, Under Review for 50th IEEE ICASSP 2025, Hyderabad, India
+
+
+
+
+ + ♻ ☆ SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature + Disentanglement and Enhancement + + +
+ Singing voice conversion (SVC) aims to convert a singer's voice to another singer's from a reference audio while keeping the original semantics. However, existing SVC methods can hardly perform zero-shot due to incomplete feature disentanglement or dependence on the speaker look-up table. We propose the first open-source high-quality zero-shot SVC model, SaMoye, which can convert singing to both human and non-human timbres. SaMoye disentangles the singing voice's features into content, timbre, and pitch features, where we combine multiple ASR models and compress the content features to reduce timbre leakage. Besides, we enhance the timbre features by unfreezing the speaker encoder and mixing the speaker embedding with those of the top-3 most similar speakers. We also establish an unparalleled large-scale dataset to guarantee zero-shot performance, which comprises more than 1,815 hours of pure singing voice and 6,367 speakers. We conduct objective and subjective experiments and find that SaMoye outperforms other models in zero-shot SVC tasks, even under extreme conditions like converting singing to animal timbres. The code and weights of SaMoye are available at https://github.com/CarlWangChina/SaMoye-SVC.
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Deepfake Detection: A Comprehensive Survey from the Reliability + Perspective + + +
+ The proliferation of Deepfake synthetic materials circulated on the internet has had a profound social impact on politicians, celebrities, and individuals worldwide. In this survey, we provide a thorough review of the existing Deepfake detection studies from the reliability perspective. We identify three reliability-oriented research challenges in the current Deepfake detection domain: transferability, interpretability, and robustness. Moreover, while solutions to these three challenges have been frequently proposed, the general reliability of a detection model has barely been considered, leading to a lack of reliable evidence for real-life use and even for the prosecution of Deepfake-related cases in court. We therefore introduce a model reliability study metric, based on statistical random sampling and publicly available benchmark datasets, to review the reliability of existing detection models on arbitrary Deepfake candidate suspects. Case studies are further conducted to examine real-life Deepfake cases involving different groups of victims with the help of the detection models qualified as reliable in this survey. Reviews and experiments on the existing approaches provide informative discussions and future research directions for Deepfake detection.
+
+
+
+
+ + ♻ ☆ Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study + on Word Error Rate and Fusion Techniques + + +
+ Text data is commonly utilized as a primary input to enhance Speech Emotion +Recognition (SER) performance and reliability. However, the reliance on +human-transcribed text in most studies impedes the development of practical SER +systems, creating a gap between in-lab research and real-world scenarios where +Automatic Speech Recognition (ASR) serves as the text source. Hence, this study +benchmarks SER performance using ASR transcripts with varying Word Error Rates +(WERs) from eleven models on three well-known corpora: IEMOCAP, CMU-MOSI, and +MSP-Podcast. Our evaluation includes both text-only and bimodal SER with six +fusion techniques, aiming for a comprehensive analysis that uncovers novel +findings and challenges faced by current SER research. Additionally, we propose +a unified ASR error-robust framework integrating ASR error correction and +modality-gated fusion, achieving lower WER and higher SER results compared to +the best-performing ASR transcript. These findings provide insights into SER +with ASR assistance, especially for real-world applications. + +
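+ A minimal sketch of one plausible form of modality-gated fusion (an assumed architecture, not necessarily the one proposed in the paper): a learned sigmoid gate decides, per dimension, how much to trust the acoustic embedding versus the embedding of the possibly erroneous ASR transcript.
+
+ import torch
+ import torch.nn as nn
+
+ class GatedFusion(nn.Module):
+     def __init__(self, d_audio, d_text, d_out):
+         super().__init__()
+         self.proj_a = nn.Linear(d_audio, d_out)
+         self.proj_t = nn.Linear(d_text, d_out)
+         self.gate = nn.Linear(d_audio + d_text, d_out)
+
+     def forward(self, a, t):
+         g = torch.sigmoid(self.gate(torch.cat([a, t], dim=-1)))  # gate in [0, 1]
+         return g * self.proj_a(a) + (1 - g) * self.proj_t(t)
+
+ fusion = GatedFusion(d_audio=128, d_text=768, d_out=256)
+ classifier = nn.Linear(256, 4)             # 4 emotion classes (toy setting)
+ audio_feat = torch.randn(4, 128)           # e.g. utterance-level acoustic embedding
+ text_feat = torch.randn(4, 768)            # e.g. embedding of the ASR transcript
+ emotion_logits = classifier(fusion(audio_feat, text_feat))
+ print(emotion_logits.shape)                # torch.Size([4, 4])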
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ♻ ☆ Dance-to-Music Generation with Encoder-based Textual Inversion SIGGRAPH + + +
+ The seamless integration of music with dance movements is essential for +communicating the artistic intent of a dance piece. This alignment also +significantly improves the immersive quality of gaming experiences and +animation productions. Although there has been remarkable advancement in +creating high-fidelity music from textual descriptions, current methodologies +mainly focus on modulating overall characteristics such as genre and emotional +tone. They often overlook the nuanced management of temporal rhythm, which is +indispensable in crafting music for dance, since it intricately aligns the +musical beats with the dancers' movements. Recognizing this gap, we propose an +encoder-based textual inversion technique to augment text-to-music models with +visual control, facilitating personalized music generation. Specifically, we +develop dual-path rhythm-genre inversion to effectively integrate the rhythm +and genre of a dance motion sequence into the textual space of a text-to-music +model. Contrary to traditional textual inversion methods, which directly update +text embeddings to reconstruct a single target object, our approach utilizes +separate rhythm and genre encoders to obtain text embeddings for two +pseudo-words, adapting to the varying rhythms and genres. We collect a new +dataset called In-the-wild Dance Videos (InDV) and demonstrate that our +approach outperforms state-of-the-art methods across multiple evaluation +metrics. Furthermore, our method is able to adapt to changes in tempo and +effectively integrates with the inherent text-guided generation capability of +the pre-trained model. Our source code and demo videos are available at +\url{https://github.com/lsfhuihuiff/Dance-to-music_Siggraph_Asia_2024} + +
+
+ comment: 11 pages, 5 figures, SIGGRAPH ASIA 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 62 + +
+
+
+ + ☆ Knowledge Tagging with Large Language Model based Multi-Agent System + + +
+ Knowledge tagging for questions is vital in modern intelligent educational +applications, including learning progress diagnosis, practice question +recommendations, and course content organization. Traditionally, these +annotations have been performed by pedagogical experts, as the task demands not +only a deep semantic understanding of question stems and knowledge definitions +but also a strong ability to link problem-solving logic with relevant knowledge +concepts. With the advent of advanced natural language processing (NLP) +algorithms, such as pre-trained language models and large language models +(LLMs), pioneering studies have explored automating the knowledge tagging +process using various machine learning models. In this paper, we investigate +the use of a multi-agent system to address the limitations of previous +algorithms, particularly in handling complex cases involving intricate +knowledge definitions and strict numerical constraints. By demonstrating its +superior performance on the publicly available math question knowledge tagging +dataset, MathKnowCT, we highlight the significant potential of an LLM-based +multi-agent system in overcoming the challenges that previous methods have +encountered. Finally, through an in-depth discussion of the implications of +automating knowledge tagging, we underscore the promising results of deploying +LLM-based algorithms in educational contexts. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Self-Supervised Inference of Agents in Trustless Environments + + +
+ In this paper, we propose a novel approach where agents can form swarms to produce high-quality responses effectively. This is accomplished by utilizing agents capable of data inference and ranking, which can be effectively implemented using LLMs as response classifiers. We assess existing approaches for trustless agent inference, define our methodology, estimate practical parameters, and model various types of malicious agent attacks. Our method leverages the collective intelligence of swarms, ensuring robust and efficient decentralized AI inference with better accuracy, security, and reliability. We show that our approach is an order of magnitude faster than other trustless inference strategies, reaching less than 125 ms validation latency.
+
+
+
+
+ + ☆ Rethinking Prompting Strategies for Multi-Label Recognition with Partial + Annotations + + +
+ Vision-language models (VLMs) like CLIP have been adapted for Multi-Label Recognition (MLR) with partial annotations by leveraging prompt-learning, where positive and negative prompts are learned for each class to associate their embeddings with class presence or absence in the shared vision-text feature space. While this approach improves MLR performance by relying on VLM priors, we hypothesize that learning negative prompts may be suboptimal, as the datasets used to train VLMs lack image-caption pairs explicitly focusing on class absence. To analyze the impact of positive and negative prompt learning on MLR, we introduce PositiveCoOp and NegativeCoOp, where only one prompt is learned with VLM guidance while the other is replaced by an embedding vector learned directly in the shared feature space without relying on the text encoder. Through empirical analysis, we observe that negative prompts degrade MLR performance, and learning only positive prompts, combined with learned negative embeddings (PositiveCoOp), outperforms dual prompt learning approaches. Moreover, we quantify the performance benefits that prompt-learning offers over a simple vision-features-only baseline, observing that the baseline displays strong performance comparable to the dual prompt learning approach (DualCoOp) when the proportion of missing labels is low, while requiring half the training compute and 16 times fewer parameters.
+
+
+
+
+ + ☆ Towards Quantifying and Reducing Language Mismatch Effects in + Cross-Lingual Speech Anti-Spoofing + + +
+ Language mismatch affects speech anti-spoofing systems, yet investigations and quantification of these effects remain limited. Existing anti-spoofing datasets are mainly in English, and the high cost of acquiring multilingual datasets hinders training language-independent models. We initiate this work by evaluating top-performing speech anti-spoofing systems that are trained on English data but tested on other languages, observing notable performance declines. We propose an innovative approach - Accent-based data expansion via TTS (ACCENT) - which introduces diverse linguistic knowledge to monolingual-trained models, improving their cross-lingual capabilities. We conduct experiments on a large-scale dataset consisting of over 3 million samples, including 1.8 million training samples and nearly 1.2 million testing samples across 12 languages. The language mismatch effects are preliminarily quantified and are reduced by over 15% when applying the proposed ACCENT. This easily implementable method shows promise for multilingual and low-resource language scenarios.
+
+ comment: Accepted to the IEEE Spoken Language Technology Workshop (SLT) 2024. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Real or Robotic? Assessing Whether LLMs Accurately Simulate Qualities of + Human Responses in Dialogue + + +
+ Studying and building datasets for dialogue tasks is both expensive and time-consuming due to the need to recruit, train, and collect data from study participants. In response, much recent work has sought to use large language models (LLMs) to simulate both human-human and human-LLM interactions, as they have been shown to generate convincingly human-like text in many settings. However, to what extent do LLM-based simulations actually reflect human dialogues? In this work, we answer this question by generating a large-scale dataset of 100,000 paired LLM-LLM and human-LLM dialogues from the WildChat dataset and quantifying how well the LLM simulations align with their human counterparts. Overall, we find relatively low alignment between simulations and human interactions, demonstrating a systematic divergence along multiple textual properties, including style and content. Further, in comparisons of English, Chinese, and Russian dialogues, we find that models perform similarly. Our results suggest that LLMs generally perform better when the human writes in a way that is more similar to the LLM's own style.
+
+
+
+
+ + ☆ Source2Synth: Synthetic Data Generation and Curation Grounded in Real + Data Sources + + +
+ Large Language Models still struggle in challenging scenarios that leverage +structured data, complex reasoning, or tool usage. In this paper, we propose +Source2Synth: a new method that can be used for teaching LLMs new skills +without relying on costly human annotations. Source2Synth takes as input a +custom data source and produces synthetic data points with intermediate +reasoning steps grounded in real-world sources. Source2Synth improves the +dataset quality by discarding low-quality generations based on their +answerability. We demonstrate the generality of this approach by applying it to +two challenging domains: we test reasoning abilities in multi-hop question +answering (MHQA), and tool usage in tabular question answering (TQA). Our +method improves performance by 25.51% for TQA on WikiSQL and 22.57% for MHQA on +HotPotQA compared to the fine-tuned baselines. + +
+
+
+
+
+ + ☆ LLM Honeypot: Leveraging Large Language Models as Advanced Interactive + Honeypot Systems + + +
+ The rapid evolution of cyber threats necessitates innovative solutions for +detecting and analyzing malicious activity. Honeypots, which are decoy systems +designed to lure and interact with attackers, have emerged as a critical +component in cybersecurity. In this paper, we present a novel approach to +creating realistic and interactive honeypot systems using Large Language Models +(LLMs). By fine-tuning a pre-trained open-source language model on a diverse +dataset of attacker-generated commands and responses, we developed a honeypot +capable of sophisticated engagement with attackers. Our methodology involved +several key steps: data collection and processing, prompt engineering, model +selection, and supervised fine-tuning to optimize the model's performance. +Evaluation through similarity metrics and live deployment demonstrated that our +approach effectively generates accurate and informative responses. The results +highlight the potential of LLMs to revolutionize honeypot technology, providing +cybersecurity professionals with a powerful tool to detect and analyze +malicious activity, thereby enhancing overall security infrastructure. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ What Makes a Maze Look Like a Maze? + + +
+ A unique aspect of human visual understanding is the ability to flexibly +interpret abstract concepts: acquiring lifted rules explaining what they +symbolize, grounding them across familiar and unfamiliar contexts, and making +predictions or reasoning about them. While off-the-shelf vision-language models +excel at making literal interpretations of images (e.g., recognizing object +categories such as tree branches), they still struggle to make sense of such +visual abstractions (e.g., how an arrangement of tree branches may form the +walls of a maze). To address this challenge, we introduce Deep Schema Grounding +(DSG), a framework that leverages explicit structured representations of visual +abstractions for grounding and reasoning. At the core of DSG are +schemas--dependency graph descriptions of abstract concepts that decompose them +into more primitive-level symbols. DSG uses large language models to extract +schemas, then hierarchically grounds concrete to abstract components of the +schema onto images with vision-language models. The grounded schema is used to +augment visual abstraction understanding. We systematically evaluate DSG and +different methods in reasoning on our new Visual Abstractions Dataset, which +consists of diverse, real-world images of abstract concepts and corresponding +question-answer pairs labeled by humans. We show that DSG significantly +improves the abstract visual reasoning performance of vision-language models, +and is a step toward human-aligned understanding of visual abstractions. + +
+
+
+
+
+ + ☆ AudioBERT: Audio Knowledge Augmented Language Model + + +
+ Recent studies have identified that language models, pretrained on text-only datasets, often lack elementary visual knowledge, e.g., colors of everyday objects. Motivated by this observation, we ask whether a similar shortcoming exists in terms of auditory knowledge. To answer this question, we construct a new dataset called AuditoryBench, which consists of two novel tasks for evaluating auditory knowledge. Based on our analysis using the benchmark, we find that language models also suffer from a severe lack of auditory knowledge. To address this limitation, we propose AudioBERT, a novel method to augment the auditory knowledge of BERT through a retrieval-based approach. First, we detect auditory knowledge spans in prompts to query our retrieval model efficiently. Then, we inject audio knowledge into BERT and switch on low-rank adaptation for effective adaptation when audio knowledge is required. Our experiments demonstrate that AudioBERT is quite effective, achieving superior performance on the AuditoryBench. The dataset and code are available at https://github.com/HJ-Ok/AudioBERT.
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Fine-tuning Large Language Models for Entity Matching + + +
+ Generative large language models (LLMs) are a promising alternative to +pre-trained language models for entity matching due to their high zero-shot +performance and their ability to generalize to unseen entities. Existing +research on using LLMs for entity matching has focused on prompt engineering +and in-context learning. This paper explores the potential of fine-tuning LLMs +for entity matching. We analyze fine-tuning along two dimensions: 1) The +representation of training examples, where we experiment with adding different +types of LLM-generated explanations to the training set, and 2) the selection +and generation of training examples using LLMs. In addition to the matching +performance on the source dataset, we investigate how fine-tuning affects the +model's ability to generalize to other in-domain datasets as well as across +topical domains. Our experiments show that fine-tuning significantly improves +the performance of the smaller models while the results for the larger models +are mixed. Fine-tuning also improves the generalization to in-domain datasets +while hurting cross-domain transfer. We show that adding structured +explanations to the training set has a positive impact on the performance of +three out of four LLMs, while the proposed example selection and generation +methods only improve the performance of Llama 3.1 8B while decreasing the +performance of GPT-4o Mini. + +
+
+ comment: 8 pages, 4 figures. For related code and data, see https://github.com/wbsg-uni-mannheim/TailorMatch
+
+
+
+
+ + ☆ On the Role of Context in Reading Time Prediction + + +
+ We present a new perspective on how readers integrate context during +real-time language comprehension. Our proposals build on surprisal theory, +which posits that the processing effort of a linguistic unit (e.g., a word) is +an affine function of its in-context information content. We first observe that +surprisal is only one out of many potential ways that a contextual predictor +can be derived from a language model. Another one is the pointwise mutual +information (PMI) between a unit and its context, which turns out to yield the +same predictive power as surprisal when controlling for unigram frequency. +Moreover, both PMI and surprisal are correlated with frequency. This means that +neither PMI nor surprisal contains information about context alone. In response +to this, we propose a technique where we project surprisal onto the orthogonal +complement of frequency, yielding a new contextual predictor that is +uncorrelated with frequency. Our experiments show that the proportion of +variance in reading times explained by context is a lot smaller when context is +represented by the orthogonalized predictor. From an interpretability +standpoint, this indicates that previous studies may have overstated the role +that context has in predicting reading times. + +
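+ A minimal sketch of the orthogonalization step under stated assumptions (synthetic surprisal and log-frequency values): regress surprisal on frequency and keep the residual, i.e. project surprisal onto the orthogonal complement of frequency before it enters the reading-time model.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ n = 5000
+ log_freq = rng.normal(size=n)
+ surprisal = -0.8 * log_freq + rng.normal(scale=0.5, size=n)  # surprisal correlates with frequency
+
+ X = np.column_stack([np.ones(n), log_freq])                  # intercept + frequency
+ beta, *_ = np.linalg.lstsq(X, surprisal, rcond=None)
+ surprisal_orth = surprisal - X @ beta                        # frequency-free contextual predictor
+
+ print("corr(surprisal, log_freq)      =", round(np.corrcoef(surprisal, log_freq)[0, 1], 3))
+ print("corr(surprisal_orth, log_freq) =", round(np.corrcoef(surprisal_orth, log_freq)[0, 1], 3))
+ # Reading times would then be modelled with log_freq and surprisal_orth as separate
+ # predictors, so variance explained by surprisal_orth is attributable to context alone.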
+
+
+
+
+ + ☆ LLM-POTUS Score: A Framework of Analyzing Presidential Debates with + Large Language Models + + +
+ Large language models have demonstrated remarkable capabilities in natural +language processing, yet their application to political discourse analysis +remains underexplored. This paper introduces a novel approach to evaluating +presidential debate performances using LLMs, addressing the longstanding +challenge of objectively assessing debate outcomes. We propose a framework that +analyzes candidates' "Policies, Persona, and Perspective" (3P) and how they +resonate with the "Interests, Ideologies, and Identity" (3I) of four key +audience groups: voters, businesses, donors, and politicians. Our method +employs large language models to generate the LLM-POTUS Score, a quantitative +measure of debate performance based on the alignment between 3P and 3I. We +apply this framework to analyze transcripts from recent U.S. presidential +debates, demonstrating its ability to provide nuanced, multi-dimensional +assessments of candidate performances. Our results reveal insights into the +effectiveness of different debating strategies and their impact on various +audience segments. This study not only offers a new tool for political analysis +but also explores the potential and limitations of using LLMs as impartial +judges in complex social contexts. In addition, this framework provides +individual citizens with an independent tool to evaluate presidential debate +performances, which enhances democratic engagement and reduces reliance on +potentially biased media interpretations and institutional influence, thereby +strengthening the foundation of informed civic participation. + +
+
+
+
+
+ + ☆ WhisperNER: Unified Open Named Entity and Speech Recognition + + +
+ Integrating named entity recognition (NER) with automatic speech recognition +(ASR) can significantly enhance transcription accuracy and informativeness. In +this paper, we introduce WhisperNER, a novel model that allows joint speech +transcription and entity recognition. WhisperNER supports open-type NER, +enabling recognition of diverse and evolving entities at inference. Building on +recent advancements in open NER research, we augment a large synthetic dataset +with synthetic speech samples. This allows us to train WhisperNER on a large +number of examples with diverse NER tags. During training, the model is +prompted with NER labels and optimized to output the transcribed utterance +along with the corresponding tagged entities. To evaluate WhisperNER, we +generate synthetic speech for commonly used NER benchmarks and annotate +existing ASR datasets with open NER tags. Our experiments demonstrate that +WhisperNER outperforms natural baselines on both out-of-domain open type NER +and supervised finetuning. + +
+
+
+
+
+ + ☆ The Faetar Benchmark: Speech Recognition in a Very Under-Resourced + Language + + +
+ We introduce the Faetar Automatic Speech Recognition Benchmark, a benchmark corpus designed to push the limits of current approaches to low-resource speech recognition. Faetar, a Franco-Provençal variety spoken primarily in Italy, has no standard orthography, has virtually no existing textual or speech resources other than what is included in the benchmark, and is quite different from other forms of Franco-Provençal. The corpus comes from field recordings, most of which are noisy, for which only 5 hrs have matching transcriptions, and for which forced alignment is of variable quality. The corpus contains an additional 20 hrs of unlabelled speech. We report baseline results from state-of-the-art multilingual speech foundation models with a best phone error rate of 30.4%, using a pipeline that continues pre-training on the foundation model using the unlabelled set.
+
+
+
+
+ + ☆ The CLC-UKET Dataset: Benchmarking Case Outcome Prediction for the UK + Employment Tribunal + + +
+ This paper explores the intersection of technological innovation and access +to justice by developing a benchmark for predicting case outcomes in the UK +Employment Tribunal (UKET). To address the challenge of extensive manual +annotation, the study employs a large language model (LLM) for automatic +annotation, resulting in the creation of the CLC-UKET dataset. The dataset +consists of approximately 19,000 UKET cases and their metadata. Comprehensive +legal annotations cover facts, claims, precedent references, statutory +references, case outcomes, reasons and jurisdiction codes. Facilitated by the +CLC-UKET data, we examine a multi-class case outcome prediction task in the +UKET. Human predictions are collected to establish a performance reference for +model comparison. Empirical results from baseline models indicate that +finetuned transformer models outperform zero-shot and few-shot LLMs on the UKET +prediction task. The performance of zero-shot LLMs can be enhanced by +integrating task-related information into few-shot examples. We hope that the +CLC-UKET dataset, along with human annotations and empirical findings, can +serve as a valuable benchmark for employment-related dispute resolution. + +
+
+
+
+
+ + ☆ TravelAgent: An AI Assistant for Personalized Travel Planning + + +
+ As global tourism expands and artificial intelligence technology advances, +intelligent travel planning services have emerged as a significant research +focus. Within dynamic real-world travel scenarios with multi-dimensional +constraints, services that support users in automatically creating practical +and customized travel itineraries must address three key objectives: +Rationality, Comprehensiveness, and Personalization. However, existing systems +with rule-based combinations or LLM-based planning methods struggle to fully +satisfy these criteria. To overcome the challenges, we introduce TravelAgent, a +travel planning system powered by large language models (LLMs) designed to +provide reasonable, comprehensive, and personalized travel itineraries grounded +in dynamic scenarios. TravelAgent comprises four modules: Tool-usage, +Recommendation, Planning, and Memory Module. We evaluate TravelAgent's +performance with human and simulated users, demonstrating its overall +effectiveness in three criteria and confirming the accuracy of personalized +recommendations. + +
+
+
+
+
+ + ☆ Enhanced Online Grooming Detection Employing Context Determination and + Message-Level Analysis + + +
+ Online Grooming (OG) is a prevalent threat facing predominantly children online, with groomers using deceptive methods to prey on the vulnerability of children on social media/messaging platforms. These attacks can have severe psychological and physical impacts, including a tendency towards revictimization. Current technical measures are inadequate, especially with the advent of end-to-end encryption which hampers message monitoring. Existing solutions focus on the signature analysis of child abuse media, which does not effectively address real-time OG detection. This paper proposes that OG attacks are complex, requiring the identification of specific communication patterns between adults and children. It introduces a novel approach leveraging advanced models such as BERT and RoBERTa for Message-Level Analysis and a Context Determination approach for classifying actor interactions, including the introduction of Actor Significance Thresholds and Message Significance Thresholds. The proposed method aims to enhance accuracy and robustness in detecting OG by considering the dynamic and multi-faceted nature of these attacks. Cross-dataset experiments evaluate the robustness and versatility of our approach. This paper's contributions include improved detection methodologies and the potential for application in various scenarios, addressing gaps in current literature and practices.
+
+
+
+
+ + ☆ A corpus-based investigation of pitch contours of monosyllabic words in + conversational Taiwan Mandarin + + +
+ In Mandarin, the tonal contours of monosyllabic words produced in isolation or in careful speech are characterized by four lexical tones: a high-level tone (T1), a rising tone (T2), a dipping tone (T3) and a falling tone (T4). However, in spontaneous speech, the actual tonal realization of monosyllabic words can deviate significantly from these canonical tones due to intra-syllabic co-articulation and inter-syllabic co-articulation with adjacent tones. In addition, Chuang et al. (2024) recently reported that the tonal contours of disyllabic Mandarin words with the T2-T4 tone pattern are co-determined by their meanings. Following up on their research, we present a corpus-based investigation of how the pitch contours of monosyllabic words are realized in spontaneous conversational Mandarin, focusing on the effects of contextual predictors on the one hand, and on the way in which words' meanings co-determine pitch contours on the other hand. We analyze the F0 contours of 3824 tokens of 63 different word types in a spontaneous Taiwan Mandarin corpus, using the generalized additive (mixed) model to decompose a given observed pitch contour into a set of component pitch contours. We show that the tonal context substantially modifies a word's canonical tone. Once the effect of tonal context is controlled for, T2 and T3 emerge as low flat tones, contrasting with T1 as a high tone, and with T4 as a high-to-mid falling tone. The neutral tone (T0), which in standard descriptions is realized based on the preceding tone, emerges as a low tone in its own right, modified by the other predictors in the same way as the standard tones T1, T2, T3, and T4. We also show that word, and even more so, word sense, co-determine words' F0 contours. Analyses of variable importance using random forests further supported the substantial effect of tonal context and an effect of word sense.
+
+
+
+
+ + ☆ Learning Rules from KGs Guided by Language Models + + +
+ Advances in information extraction have enabled the automatic construction of large knowledge graphs (KGs), e.g., Yago, Wikidata or Google KG, which are widely used in many applications like semantic search or data analytics. However, due to their semi-automatic construction, KGs are often incomplete. Rule learning methods, concerned with the extraction of frequent patterns from KGs and casting them into rules, can be applied to predict potentially missing facts. A crucial step in this process is rule ranking. Ranking of rules is especially challenging over highly incomplete or biased KGs (e.g., KGs predominantly storing facts about famous people), as in this case biased rules might fit the data best and be ranked at the top based on standard statistical metrics like rule confidence. To address this issue, prior works proposed to rank rules not only relying on the original KG but also on facts predicted by a KG embedding model. At the same time, with the recent rise of Language Models (LMs), several works have claimed that LMs can be used as an alternative means for KG completion. In this work, our goal is to verify to what extent the exploitation of LMs is helpful for improving the quality of rule learning systems.
+
+ comment: proof of concept +
+
+
+
+
+ + ☆ FPMT: Enhanced Semi-Supervised Model for Traffic Incident Detection ICPR 2024 + + +
+ For traffic incident detection, the acquisition of data and labels is notably resource-intensive, rendering semi-supervised traffic incident detection both a formidable and a consequential challenge. Thus, this paper focuses on traffic incident detection in a semi-supervised manner. It proposes a semi-supervised learning model named FPMT within the framework of MixText. The data augmentation module introduces Generative Adversarial Networks to balance and expand the dataset. During the mix-up process in the hidden space, it employs a probabilistic pseudo-mixing mechanism to enhance regularization and elevate model precision. In terms of training strategy, it starts with unsupervised training on all data, followed by supervised fine-tuning on a subset of labeled data, ultimately completing the goal of semi-supervised training. Through empirical validation on four authentic datasets, our FPMT model exhibits outstanding performance across various metrics. Particularly noteworthy is its robust performance even in scenarios with low label rates.
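+ A minimal sketch of hidden-space mix-up in the MixText style that FPMT builds on; the Beta-sampled coefficient and the per-pair mixing decision below are assumptions approximating the "probabilistic pseudo-mixing mechanism", not the paper's exact formulation.
+
+ import torch
+
+ def hidden_mixup(h1, h2, y1, y2, alpha=0.75, mix_prob=0.5):
+     lam = torch.distributions.Beta(alpha, alpha).sample()
+     lam = torch.max(lam, 1 - lam)                        # keep the mix close to the first example
+     do_mix = (torch.rand(h1.shape[0]) < mix_prob).float().view(-1, 1)
+     lam_eff = do_mix * lam + (1 - do_mix)                # 1.0 means no mixing for that pair
+     h_mixed = lam_eff.unsqueeze(-1) * h1 + (1 - lam_eff.unsqueeze(-1)) * h2
+     y_mixed = lam_eff * y1 + (1 - lam_eff) * y2
+     return h_mixed, y_mixed
+
+ h1, h2 = torch.randn(8, 16, 256), torch.randn(8, 16, 256)   # BERT-style hidden states
+ y1 = torch.eye(3)[torch.randint(0, 3, (8,))]                # one-hot incident labels
+ y2 = torch.eye(3)[torch.randint(0, 3, (8,))]
+ h_mixed, y_mixed = hidden_mixup(h1, h2, y1, y2)
+ print(h_mixed.shape, y_mixed.shape)                         # mixed states feed the upper layers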
+
+ comment: 14 pages, 3 figures, accepted by ICPR 2024 +
+
+
+
+
+ + ☆ Online vs Offline: A Comparative Study of First-Party and Third-Party + Evaluations of Social Chatbots + + +
+ This paper explores the efficacy of online versus offline evaluation methods +in assessing conversational chatbots, specifically comparing first-party direct +interactions with third-party observational assessments. By extending a +benchmarking dataset of user dialogs with empathetic chatbots with offline +third-party evaluations, we present a systematic comparison between the +feedback from online interactions and the more detached offline third-party +evaluations. Our results reveal that offline human evaluations fail to capture +the subtleties of human-chatbot interactions as effectively as online +assessments. In comparison, automated third-party evaluations using a GPT-4 +model offer a better approximation of first-party human judgments given +detailed instructions. This study highlights the limitations of third-party +evaluations in grasping the complexities of user experiences and advocates for +the integration of direct interaction feedback in conversational AI evaluation +to enhance system development and user satisfaction. + +
+
+
+
+
+ + ☆ Controllable Synthetic Clinical Note Generation with Privacy Guarantees + + +
+ In the field of machine learning, domain-specific annotated data is an +invaluable resource for training effective models. However, in the medical +domain, this data often includes Personal Health Information (PHI), raising +significant privacy concerns. The stringent regulations surrounding PHI limit +the availability and sharing of medical datasets, which poses a substantial +challenge for researchers and practitioners aiming to develop advanced machine +learning models. In this paper, we introduce a novel method to "clone" datasets +containing PHI. Our approach ensures that the cloned datasets retain the +essential characteristics and utility of the original data without compromising +patient privacy. By leveraging differential-privacy techniques and a novel +fine-tuning task, our method produces datasets that are free from identifiable +information while preserving the statistical properties necessary for model +training. We conduct utility testing to evaluate the performance of machine +learning models trained on the cloned datasets. The results demonstrate that +our cloned datasets not only uphold privacy standards but also enhance model +performance compared to those trained on traditional anonymized datasets. This +work offers a viable solution for the ethical and effective utilization of +sensitive medical data in machine learning, facilitating progress in medical +research and the development of robust predictive models. + +
+
+
+
+
+ + ☆ Full-text Error Correction for Chinese Speech Recognition with Large + Language Model + + +
+ Large Language Models (LLMs) have demonstrated substantial potential for +error correction in Automatic Speech Recognition (ASR). However, most research +focuses on utterances from short-duration speech recordings, which are the +predominant form of speech data for supervised ASR training. This paper +investigates the effectiveness of LLMs for error correction in full-text +generated by ASR systems from longer speech recordings, such as transcripts +from podcasts, news broadcasts, and meetings. First, we develop a Chinese +dataset for full-text error correction, named ChFT, utilizing a pipeline that +involves text-to-speech synthesis, ASR, and error-correction pair extractor. +This dataset enables us to correct errors across contexts, including both +full-text and segment, and to address a broader range of error types, such as +punctuation restoration and inverse text normalization, thus making the +correction process comprehensive. Second, we fine-tune a pre-trained LLM on the +constructed dataset using a diverse set of prompts and target formats, and +evaluate its performance on full-text error correction. Specifically, we design +prompts based on full-text and segment, considering various output formats, +such as directly corrected text and JSON-based error-correction pairs. Through +various test settings, including homogeneous, up-to-date, and hard test sets, +we find that the fine-tuned LLMs perform well in the full-text setting with +different prompts, each presenting its own strengths and weaknesses. This +establishes a promising baseline for further research. The dataset is available +on the website. + +
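A minimal sketch of the prompt-and-parse loop for the JSON-based output format mentioned above; the prompt wording and the JSON schema here are assumptions made for illustration, not the actual ChFT prompts or formats.

```python
# Illustrative full-text ASR error-correction round trip (schema is hypothetical).
import json

def build_prompt(asr_text):
    return ("Correct the recognition errors in the following transcript. "
            "Return a JSON list of {\"wrong\": ..., \"correct\": ...} pairs.\n"
            f"Transcript: {asr_text}")

def apply_corrections(asr_text, llm_output):
    pairs = json.loads(llm_output)            # JSON-based error-correction pairs
    for p in pairs:
        asr_text = asr_text.replace(p["wrong"], p["correct"])
    return asr_text

fixed = apply_corrections("we meat at nine",
                          '[{"wrong": "meat", "correct": "meet"}]')
print(fixed)   # "we meet at nine"
```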
+
+
+
+
+ + ☆ Stable Language Model Pre-training by Reducing Embedding Variability + + +
+ Stable pre-training is essential for achieving better-performing language +models. However, tracking pre-training stability by calculating gradient +variance at every step is impractical due to the significant computational +costs. We explore Token Embedding Variability (TEV) as a simple and efficient +proxy for assessing pre-training stability in language models with pre-layer +normalization, given that shallower layers are more prone to gradient explosion +(section 2.2). Moreover, we propose Multi-head Low-Rank Attention (MLRA) as an +architecture to alleviate such instability by limiting the exponential growth +of output embedding variance, thereby preventing the gradient explosion +(section 3.2). Empirical results on GPT-2 with MLRA demonstrate increased +stability and lower perplexity, particularly in deeper models. + +
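A rough sketch of what an embedding-variability proxy could look like; the paper's exact TEV definition may differ, so treat this as an assumption-laden illustration rather than the proposed metric.

```python
# Hedged sketch: per-dimension standard deviation of the (output) token
# embedding matrix, averaged into a single scalar proxy for training stability.
import torch

def token_embedding_variability(embedding_weight: torch.Tensor) -> float:
    # embedding_weight: (vocab_size, hidden_dim)
    return embedding_weight.std(dim=0).mean().item()

emb = torch.nn.Embedding(50257, 768).weight.detach()   # GPT-2-sized example
print(token_embedding_variability(emb))
```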
+
+
+
+
+ + ☆ Supporting Online Discussions: Integrating AI Into the adhocracy+ + Participation Platform To Enhance Deliberation + + +
+ Online spaces allow people to discuss important issues and make joint +decisions, regardless of their location or time zone. However, without proper +support and thoughtful design, these discussions often lack structure and +politeness during the exchanges of opinions. Artificial intelligence (AI) +represents an opportunity to support both participants and organizers of +large-scale online participation processes. In this paper, we present an +extension of adhocracy+, a large-scale open source participation platform, that +provides two additional debate modules that are supported by AI to enhance the +discussion quality and participant interaction. + +
+
+
+
+
+ + ☆ Top-down Activity Representation Learning for Video Question Answering + + +
+ Capturing complex hierarchical human activities, from atomic actions (e.g., +picking up one present, moving to the sofa, unwrapping the present) to +contextual events (e.g., celebrating Christmas) is crucial for achieving +high-performance video question answering (VideoQA). Recent works have expanded +multimodal models (e.g., CLIP, LLaVA) to process continuous video sequences, +enhancing the model's temporal reasoning capabilities. However, these +approaches often fail to capture contextual events that can be decomposed into +multiple atomic actions non-continuously distributed over relatively long-term +sequences. In this paper, to leverage the spatial visual context representation +capability of the CLIP model for obtaining non-continuous visual +representations in terms of contextual events in videos, we convert long-term +video sequences into a spatial image domain and finetune the multimodal model +LLaVA for the VideoQA task. Our approach achieves competitive performance on +the STAR task, in particular, with a 78.4% accuracy score, exceeding the +current state-of-the-art score by 2.8 points on the NExTQA task. + +
+
+ comment: presented at MIRU2024 +
+
+
+
+
+ + ☆ Multi-object event graph representation learning for Video Question + Answering + + +
+ Video question answering (VideoQA) is a task to predict the correct answer to +questions posed about a given video. The system must comprehend spatial and +temporal relationships among objects extracted from videos to perform causal +and temporal reasoning. While prior works have focused on modeling individual +object movements using transformer-based methods, they falter when capturing +complex scenarios involving multiple objects (e.g., "a boy is throwing a ball +in a hoop"). We propose a contrastive language event graph representation +learning method called CLanG to address this limitation. Aiming to capture +event representations associated with multiple objects, our method employs a +multi-layer GNN-cluster module for adversarial graph representation learning, +enabling contrastive learning between the question text and its relevant +multi-object event graph. Our method outperforms a strong baseline, achieving +up to 2.2% higher accuracy on two challenging VideoQA datasets, NExT-QA and +TGIF-QA-R. In particular, it is 2.8% better than baselines in handling causal +and temporal questions, highlighting its strength in reasoning multiple +object-based events. + +
+
+ comment: presented at MIRU2024 +
+
+
+
+
+ + ☆ Ruri: Japanese General Text Embeddings + + +
+ We report the development of Ruri, a series of Japanese general text +embedding models. While the development of general-purpose text embedding +models in English and multilingual contexts has been active in recent years, +model development in Japanese remains insufficient. The primary reasons for +this are the lack of datasets and the absence of necessary expertise. In this +report, we provide a detailed account of the development process of Ruri. +Specifically, we discuss the training of embedding models using synthesized +datasets generated by LLMs, the construction of the reranker for dataset +filtering and knowledge distillation, and the performance evaluation of the +resulting general-purpose text embedding models. + +
+
+
+
+
+ + ☆ Large Language Models are Pattern Matchers: Editing Semi-Structured and + Structured Documents with ChatGPT + + +
+ Large Language Models (LLMs) offer numerous applications, the full extent of +which is not yet understood. This paper investigates if LLMs can be applied for +editing structured and semi-structured documents with minimal effort. Using a +qualitative research approach, we conduct two case studies with ChatGPT and +thoroughly analyze the results. Our experiments indicate that LLMs can +effectively edit structured and semi-structured documents when provided with +basic, straightforward prompts. ChatGPT demonstrates a strong ability to +recognize and process the structure of annotated documents. This suggests that +explicitly structuring tasks and data in prompts might enhance an LLM's ability +to understand and solve tasks. Furthermore, the experiments also reveal +impressive pattern matching skills in ChatGPT. This observation deserves +further investigation, as it may contribute to understanding the processes +leading to hallucinations in LLMs. + +
+
+
+
+
+ + ☆ Experimenting with Legal AI Solutions: The Case of Question-Answering + for Access to Justice ICML 2024 + + +
+ Generative AI models, such as the GPT and Llama series, have significant +potential to assist laypeople in answering legal questions. However, little +prior work focuses on the data sourcing, inference, and evaluation of these +models in the context of laypersons. To this end, we propose a human-centric +legal NLP pipeline, covering data sourcing, inference, and evaluation. We +introduce and release a dataset, LegalQA, with real and specific legal +questions spanning from employment law to criminal law, corresponding answers +written by legal experts, and citations for each answer. We develop an +automatic evaluation protocol for this dataset, then show that +retrieval-augmented generation from only 850 citations in the train set can +match or outperform internet-wide retrieval, despite containing 9 orders of +magnitude less data. Finally, we propose future directions for open-sourced +efforts, which fall behind closed-sourced models. + +
+
+ comment: Accepted into GenLaw '24 (ICML 2024 workshop) +
+
+
+
+
+ + ☆ DSBench: How Far Are Data Science Agents to Becoming Data Science + Experts? + + +
+ Large Language Models (LLMs) and Large Vision-Language Models (LVLMs) have +demonstrated impressive language/vision reasoning abilities, igniting the +recent trend of building agents for targeted applications such as shopping +assistants or AI software engineers. Recently, many data science benchmarks +have been proposed to investigate their performance in the data science domain. +However, existing data science benchmarks still fall short when compared to +real-world data science applications due to their simplified settings. To +bridge this gap, we introduce DSBench, a comprehensive benchmark designed to +evaluate data science agents with realistic tasks. This benchmark includes 466 +data analysis tasks and 74 data modeling tasks, sourced from Eloquence and +Kaggle competitions. DSBench offers a realistic setting by encompassing long +contexts, multimodal task backgrounds, reasoning with large data files and +multi-table structures, and performing end-to-end data modeling tasks. Our +evaluation of state-of-the-art LLMs, LVLMs, and agents shows that they struggle +with most tasks, with the best agent solving only 34.12% of data analysis tasks +and achieving a 34.74% Relative Performance Gap (RPG). These findings +underscore the need for further advancements in developing more practical, +intelligent, and autonomous data science agents. + +
+
+
+
+
+ + ☆ Enhancing Q&A Text Retrieval with Ranking Models: Benchmarking, + fine-tuning and deploying Rerankers for RAG CIKM 2024 + + +
+ Ranking models play a crucial role in enhancing the overall accuracy of text +retrieval systems. These multi-stage systems typically utilize either dense +embedding models or sparse lexical indices to retrieve relevant passages based +on a given query, followed by ranking models that refine the ordering of the +candidate passages by their relevance to the query. + This paper benchmarks various publicly available ranking models and examines +their impact on ranking accuracy. We focus on text retrieval for +question-answering tasks, a common use case for Retrieval-Augmented Generation +systems. Our evaluation benchmarks include models, some of which are +commercially viable for industrial applications. + We introduce a state-of-the-art ranking model, NV-RerankQA-Mistral-4B-v3, +which achieves a significant accuracy increase of ~14% compared to pipelines +with other rerankers. We also provide an ablation study comparing the +fine-tuning of ranking models with different sizes, losses and self-attention +mechanisms. + Finally, we discuss challenges of text retrieval pipelines with ranking +models in real-world industry applications, in particular the trade-offs among +model size, ranking accuracy and system requirements like indexing and serving +latency / throughput. + 

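For readers unfamiliar with the second stage, here is a minimal rerank sketch using a public cross-encoder from the sentence-transformers library. The checkpoint name is a common public example, not the NV-RerankQA-Mistral-4B-v3 model from the paper, and the first-stage retrieval is assumed to have already produced the candidate passages.

```python
# Second-stage reranking sketch: score (query, passage) pairs with a
# cross-encoder and reorder the candidates by relevance.
from sentence_transformers import CrossEncoder

def rerank(query, candidate_passages,
           model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
    reranker = CrossEncoder(model_name)
    scores = reranker.predict([(query, p) for p in candidate_passages])
    return [p for _, p in sorted(zip(scores, candidate_passages), reverse=True)]

passages = ["Paris is the capital of France.", "Berlin is in Germany."]
print(rerank("What is the capital of France?", passages)[0])
```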
+
+ comment: Accepted for the 1st Workshop on GenAI and RAG Systems for Enterprise + @ CIKM 2024 +
+
+
+
+
+ + ☆ An Unsupervised Dialogue Topic Segmentation Model Based on Utterance + Rewriting + + +
+ Dialogue topic segmentation plays a crucial role in various types of dialogue +modeling tasks. The state-of-the-art unsupervised dialogue topic segmentation (DTS) +methods learn topic-aware discourse representations from conversation data through +adjacent discourse matching and pseudo segmentation to further mine useful clues in +unlabeled conversational relations. However, in multi-round dialogs, discourses often +have co-references or omissions, so that directly using these discourses for +representation learning may negatively affect the semantic similarity computation in +the neighboring discourse matching task. In order to fully utilize the useful cues in +conversational relations, this study proposes a novel unsupervised dialog topic +segmentation method that combines the Utterance Rewriting (UR) technique with an +unsupervised learning algorithm to efficiently utilize the useful cues in unlabeled +dialogs by rewriting the dialogs in order to recover the co-referents and omitted +words. Compared with existing unsupervised models, the proposed Utterance Rewriting +Topic Segmentation model (UR-DTS) significantly improves the accuracy of topic +segmentation. The main finding is that the performance on DialSeg711 improves by +about 6% in terms of absolute error score and WD, achieving 11.42% in terms of +absolute error score and 12.97% in terms of WD. On Doc2Dial, the absolute error +score and WD improve by about 3% and 2%, respectively, resulting in SOTA performance +of 35.17% in terms of absolute error score and 38.49% in terms of WD. This shows +that the model is very effective in capturing the nuances of conversational topics, +as well as the usefulness and challenges of utilizing unlabeled conversations. + 

+
+ comment: in Chinese language +
+
+
+
+
+ + ♻ ☆ Reasoning Abilities of Large Language Models: In-Depth Analysis on the + Abstraction and Reasoning Corpus + + +
+ The existing methods for evaluating the inference abilities of Large Language +Models (LLMs) have been results-centric, making it difficult to assess the +inference process. We introduce a new approach using the Abstraction and +Reasoning Corpus (ARC) dataset to evaluate the inference and contextual +understanding abilities of large language models in a process-centric manner. +ARC demands rigorous logical structures for problem-solving, making it a +benchmark that facilitates the comparison of model inference abilities with +humans. Experimental results confirm that while large language models possess +weak inference abilities, they still lag in terms of logical coherence, +compositionality, and productivity. Our experiments highlight the reasoning +capabilities of LLMs, proposing development paths for achieving human-level +reasoning. + +
+
+
+
+
+ + ♻ ☆ Assessing Adversarial Robustness of Large Language Models: An Empirical + Study KDD 2024 + + +
+ Large Language Models (LLMs) have revolutionized natural language processing, +but their robustness against adversarial attacks remains a critical concern. We +present a novel white-box style attack approach that exposes vulnerabilities +in leading open-source LLMs, including Llama, OPT, and T5. We assess the impact +of model size, structure, and fine-tuning strategies on their resistance to +adversarial perturbations. Our comprehensive evaluation across five diverse +text classification tasks establishes a new benchmark for LLM robustness. The +findings of this study have far-reaching implications for the reliable +deployment of LLMs in real-world applications and contribute to the advancement +of trustworthy AI systems. + 

+
+ comment: Oral presentation at KDD 2024 GenAI Evaluation workshop +
+
+
+
+
+ + ♻ ☆ Enabling Language Models to Implicitly Learn Self-Improvement ICLR 2024 + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +open-ended text generation tasks. However, the inherent open-ended nature of +these tasks implies that there is always room for improvement in the quality of +model responses. To address this challenge, various approaches have been +proposed to enhance the performance of LLMs. There has been a growing focus on +enabling LLMs to self-improve their response quality, thereby reducing the +reliance on extensive human annotation efforts for collecting diverse and +high-quality training data. Recently, prompting-based methods have been widely +explored among self-improvement methods owing to their effectiveness, +efficiency, and convenience. However, those methods usually require explicitly +and thoroughly written rubrics as inputs to LLMs. It is expensive and +challenging to manually derive and provide all necessary rubrics with a +real-world complex goal for improvement (e.g., being more helpful and less +harmful). To this end, we propose an ImPlicit Self-ImprovemenT (PIT) framework +that implicitly learns the improvement goal from human preference data. PIT +only requires preference data that are used to train reward models without +extra human efforts. Specifically, we reformulate the training objective of +reinforcement learning from human feedback (RLHF) -- instead of maximizing +response quality for a given input, we maximize the quality gap of the response +conditioned on a reference response. In this way, PIT is implicitly trained +with the improvement goal of better aligning with human preferences. +Experiments on two real-world datasets and one synthetic dataset show that our +method significantly outperforms prompting-based methods. + +
+
+ comment: Accepted at ICLR 2024. 28 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Sentiment Analysis Dataset in Moroccan Dialect: Bridging the Gap Between + Arabic and Latin Scripted dialect + + +
+ Sentiment analysis, the automated process of determining emotions or opinions +expressed in text, has seen extensive exploration in the field of natural +language processing. However, one aspect that has remained underrepresented is +the sentiment analysis of the Moroccan dialect, which boasts a unique +linguistic landscape and the coexistence of multiple scripts. Previous works in +sentiment analysis primarily targeted dialects employing Arabic script. While +these efforts provided valuable insights, they may not fully capture the +complexity of Moroccan web content, which features a blend of Arabic and Latin +script. As a result, our study emphasizes the importance of extending sentiment +analysis to encompass the entire spectrum of Moroccan linguistic diversity. +Central to our research is the creation of the largest public dataset for +Moroccan dialect sentiment analysis, one that incorporates Moroccan dialect +written not only in Arabic script but also in Latin letters. By assembling a +diverse range of textual data, we were able to construct a dataset of 20,000 +manually labeled texts in Moroccan dialect, along with publicly available lists +of stop words in Moroccan dialect. To dive into sentiment analysis, we conducted +a comparative study of multiple machine learning models to assess their +compatibility with our dataset. Experiments were performed using both raw and +preprocessed data to show the importance of the preprocessing step. We achieved +92% accuracy with our model and, to further prove its reliability, we tested it +on smaller publicly available datasets of Moroccan dialect, with favorable +results. + 

+
+ comment: Lang Resources & Evaluation (2024) +
+
+
+
+
+ + ♻ ☆ Demystifying Prompts in Language Models via Perplexity Estimation EMNLP 2023 + + +
+ Language models can be prompted to perform a wide variety of zero- and +few-shot learning problems. However, performance varies significantly with the +choice of prompt, and we do not yet understand why this happens or how to pick +the best prompts. In this work, we analyze the factors that contribute to this +variance and establish a new empirical hypothesis: the performance of a prompt +is coupled with the extent to which the model is familiar with the language it +contains. Over a wide range of tasks, we show that the lower the perplexity of +the prompt is, the better the prompt is able to perform the task. As a result, +we devise a method for creating prompts: (1) automatically extend a small seed +set of manually written prompts by paraphrasing using GPT3 and backtranslation +and (2) choose the lowest perplexity prompts to get significant gains in +performance. + +
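The selection criterion itself can be sketched directly: compute each candidate prompt's perplexity under a language model and keep the lowest-perplexity ones. The snippet below uses GPT-2 from Hugging Face transformers as a stand-in scorer (the paper's experiments rely on other models), and the candidate prompts are invented.

```python
# Score candidate prompts by perplexity and keep the lowest-perplexity one.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
tok = GPT2TokenizerFast.from_pretrained("gpt2")

def perplexity(text: str) -> float:
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(ids, labels=ids).loss      # mean token negative log-likelihood
    return torch.exp(loss).item()

candidates = ["Translate this sentence to French:",
              "French translation of the following text:"]
best_prompt = min(candidates, key=perplexity)
```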
+
+ comment: Published in Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ LLM Stability: A detailed analysis with some surprises + + +
+ LLM (large language model) practitioners commonly notice that outputs can +vary for the same inputs, but we have been unable to find work that evaluates +LLM stability as the main objective. In our study of 6 deterministically +configured LLMs across 8 common tasks with 5 identical runs, we see accuracy +variations up to 10\%. In addition, no LLM consistently delivers repeatable +accuracy across all tasks. We also show examples of variation that are not +normally distributed and compare configurations with zero-shot/few-shot +prompting and fine-tuned examples. To better quantify what is going on, we +introduce metrics focused on stability: TARr@N for the total agreement rate at +N runs over raw output, and TARa@N for total agreement over parsed-out answers. +We suggest that stability metrics be integrated into leader boards and research +results going forward. + +
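The agreement idea can be written down in a few lines; the sketch below computes a total-agreement rate over N runs, which is the spirit of TARr@N (over raw outputs) and TARa@N (over parsed answers), though the paper's exact definitions may differ.

```python
# Fraction of items for which all N runs return the same output.
def tar_at_n(runs):
    # runs: list of N lists, runs[i][j] = output of run i on item j
    n_items = len(runs[0])
    agree = sum(1 for j in range(n_items)
                if len({run[j] for run in runs}) == 1)
    return agree / n_items

raw_runs = [["A", "B", "C"], ["A", "B", "D"], ["A", "B", "C"]]
print(tar_at_n(raw_runs))   # 2/3: only items 0 and 1 agree across all runs
```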
+
+
+
+
+ + ♻ ☆ Does Liking Yellow Imply Driving a School Bus? Semantic Leakage in + Language Models + + +
+ Despite their wide adoption, the biases and unintended behaviors of language +models remain poorly understood. In this paper, we identify and characterize a +phenomenon never discussed before, which we call semantic leakage, where models +leak irrelevant information from the prompt into the generation in unexpected +ways. We propose an evaluation setting to detect semantic leakage both by +humans and automatically, curate a diverse test suite for diagnosing this +behavior, and measure significant semantic leakage in 13 flagship models. We +also show that models exhibit semantic leakage in languages besides English and +across different settings and generation scenarios. This discovery highlights +yet another type of bias in language models that affects their generation +patterns and behavior. + +
+
+
+
+
+ + ♻ ☆ Large Language Models and Games: A Survey and Roadmap + + +
+ Recent years have seen an explosive increase in research on large language +models (LLMs), and accompanying public engagement on the topic. While starting +as a niche area within natural language processing, LLMs have shown remarkable +potential across a broad range of applications and domains, including games. +This paper surveys the current state of the art across the various applications +of LLMs in and for games, and identifies the different roles LLMs can take +within a game. Importantly, we discuss underexplored areas and promising +directions for future uses of LLMs in games and we reconcile the potential and +limitations of LLMs within the games domain. As the first comprehensive survey +and roadmap at the intersection of LLMs and games, we are hopeful that this +paper will serve as the basis for groundbreaking research and innovation in +this exciting new field. + +
+
+ comment: Accepted for publication at the IEEE Transactions on Games (18 pages, + 6 figures) +
+
+
+
+
+ + ♻ ☆ Parameter Efficient Reinforcement Learning from Human Feedback + + +
+ While Reinforcement Learning from Human Feedback (RLHF) effectively aligns +pretrained Large Language and Vision-Language Models (LLMs, and VLMs) with +human preferences, its computational cost and complexity hamper its wider +adoption. To alleviate some of the computational burden of fine-tuning, +parameter efficient methods, like LoRA were introduced. In this work, we +empirically evaluate the setup of Parameter Efficient Reinforcement Learning +from Human Feedback (PE-RLHF) that leverages LoRA fine-tuning for Reward +Modeling, and Reinforcement Learning. We benchmark the PE-RLHF setup on six +diverse datasets spanning summarization, harmless/helpful response generation, +UI automation, and visual question answering in terms of effectiveness of the +trained models, and the training resources required. Our findings show, for the +first time, that PE-RLHF achieves comparable performance to RLHF, while +significantly reducing training time (up to 90% faster for reward models, and +30% faster for RL), and memory footprint (up to 50% reduction for reward +models, and 27% for RL). We provide comprehensive ablations across LoRA ranks, +and model sizes for both reward modeling and reinforcement learning. By +mitigating the computational burden associated with RLHF, we push for a broader +adoption of PE-RLHF as an alignment technique for LLMs and VLMs. + +
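To make the parameter-efficient setup concrete, here is a minimal sketch of wrapping a small reward model with LoRA adapters via the peft library. GPT-2, the rank, and the target module name are stand-ins chosen for illustration, not the models or settings used in the paper.

```python
# Hedged sketch: LoRA-wrapped reward model (base model and ranks are examples).
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification

base = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=1)
lora_cfg = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
                      target_modules=["c_attn"])   # GPT-2's attention projection
reward_model = get_peft_model(base, lora_cfg)
reward_model.print_trainable_parameters()          # only a small fraction trains
```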
+
+
+
+
+ + ♻ ☆ A Transfer Attack to Image Watermarks + + +
+ Watermark has been widely deployed by industry to detect AI-generated images. +The robustness of such watermark-based detector against evasion attacks in the +white-box and black-box settings is well understood in the literature. However, +the robustness in the no-box setting is much less understood. In this work, we +propose a new transfer evasion attack to image watermark in the no-box setting. +Our transfer attack adds a perturbation to a watermarked image to evade +multiple surrogate watermarking models trained by the attacker itself, and the +perturbed watermarked image also evades the target watermarking model. Our +major contribution is to show that, both theoretically and empirically, +watermark-based AI-generated image detector is not robust to evasion attacks +even if the attacker does not have access to the watermarking model nor the +detection API. + +
+
+
+
+
+ + ♻ ☆ Profiling checkpointing schedules in adjoint ST-AD + + +
+ Checkpointing is a cornerstone of data-flow reversal in adjoint algorithmic +differentiation. Checkpointing is a storage/recomputation trade-off that can be +applied at different levels, one of which is the call tree. We are looking +for good placements of checkpoints onto the call tree of a given application, +to reduce the run time and memory footprint of its adjoint. There is no known +optimal solution to this problem other than a combinatorial search over all +placements. We propose a heuristic based on run-time profiling of the adjoint +code. We describe the implementation of this profiling tool in an existing +source-transformation AD tool. We demonstrate the interest of this approach on +test cases taken from the MITgcm ocean and atmospheric global circulation +model. We discuss the limitations of our approach and propose directions to +lift them. + 

+
+
+
+
+ + ♻ ☆ GSIFN: A Graph-Structured and Interlaced-Masked Multimodal + Transformer-based Fusion Network for Multimodal Sentiment Analysis + + +
+ Multimodal Sentiment Analysis (MSA) leverages multiple data modals to analyze +human sentiment. Existing MSA models generally employ cutting-edge multimodal +fusion and representation learning-based methods to promote MSA capability. +However, there are two key challenges: (i) in existing multimodal fusion +methods, the decoupling of modal combinations and tremendous parameter +redundancy, lead to insufficient fusion performance and efficiency; (ii) a +challenging trade-off exists between representation capability and +computational overhead in unimodal feature extractors and encoders. Our +proposed GSIFN incorporates two main components to solve these problems: (i) a +graph-structured and interlaced-masked multimodal Transformer. It adopts the +Interlaced Mask mechanism to construct robust multimodal graph embedding, +achieve all-modal-in-one Transformer-based fusion, and greatly reduce the +computational overhead; (ii) a self-supervised learning framework with low +computational overhead and high performance, which utilizes a parallelized LSTM +with matrix memory to enhance non-verbal modal features for unimodal label +generation. Evaluated on the MSA datasets CMU-MOSI, CMU-MOSEI, and CH-SIMS, +GSIFN demonstrates superior performance with significantly lower computational +overhead compared with previous state-of-the-art models. + +
+
+
+
+
+ + ♻ ☆ DrugAgent: Explainable Drug Repurposing Agent with Large Language + Model-based Reasoning + + +
+ Drug repurposing offers a promising avenue for accelerating drug development +by identifying new therapeutic potentials of existing drugs. In this paper, we +propose a multi-agent framework to enhance the drug repurposing process using +state-of-the-art machine learning techniques and knowledge integration. Our +framework comprises several specialized agents: an AI Agent trains robust +drug-target interaction (DTI) models; a Knowledge Graph Agent utilizes the +drug-gene interaction database (DGIdb), DrugBank, Comparative Toxicogenomics +Database (CTD), and Search Tool for Interactions of Chemicals (STITCH) to +systematically extract DTIs; and a Search Agent interacts with biomedical +literature to annotate and verify computational predictions. By integrating +outputs from these agents, our system effectively harnesses diverse data +sources, including external databases, to propose viable repurposing +candidates. Preliminary results demonstrate the potential of our approach in +not only predicting drug-disease interactions but also in reducing the time and +cost associated with traditional drug discovery methods. This paper highlights +the scalability of multi-agent systems in biomedical research and their role in +driving innovation in drug repurposing. Our approach not only outperforms +existing methods in predicting drug repurposing potential but also provides +interpretable results, paving the way for more efficient and cost-effective +drug discovery processes. + +
+
+ comment: 18 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ What is the Role of Small Models in the LLM Era: A Survey + + +
+ Large Language Models (LLMs) have made significant progress in advancing +artificial general intelligence (AGI), leading to the development of +increasingly large models such as GPT-4 and LLaMA-405B. However, scaling up +model sizes results in exponentially higher computational costs and energy +consumption, making these models impractical for academic researchers and +businesses with limited resources. At the same time, Small Models (SMs) are +frequently used in practical settings, although their significance is currently +underestimated. This raises important questions about the role of small models +in the era of LLMs, a topic that has received limited attention in prior +research. In this work, we systematically examine the relationship between LLMs +and SMs from two key perspectives: Collaboration and Competition. We hope this +survey provides valuable insights for practitioners, fostering a deeper +understanding of the contribution of small models and promoting more efficient +use of computational resources. The code is available at +https://github.com/tigerchen52/role_of_small_models + +
+
+ comment: a survey paper of small models +
+
+
+
+
+ + ♻ ☆ Minimum projective linearizations of trees in linear time + + +
+ The Minimum Linear Arrangement problem (MLA) consists of finding a mapping +$\pi$ from vertices of a graph to distinct integers that minimizes +$\sum_{\{u,v\}\in E}|\pi(u) - \pi(v)|$. In that setting, vertices are often +assumed to lie on a horizontal line and edges are drawn as semicircles above +said line. For trees, various algorithms are available to solve the problem in +polynomial time in $n=|V|$. There exist variants of the MLA in which the +arrangements are constrained. Iordanskii, and later Hochberg and Stallmann +(HS), put forward $O(n)$-time algorithms that solve the problem when +arrangements are constrained to be planar (also known as one-page book +embeddings). We also consider linear arrangements of rooted trees that are +constrained to be projective (planar embeddings where the root is not covered +by any edge). Gildea and Temperley (GT) sketched an algorithm for projective +arrangements which they claimed runs in $O(n)$ but did not provide any +justification of its cost. In contrast, Park and Levy claimed that GT's +algorithm runs in $O(n \log d_{max})$ where $d_{max}$ is the maximum degree but +did not provide sufficient detail. Here we correct an error in HS's algorithm +for the planar case, show its relationship with the projective case, and derive +simple algorithms for the projective and planar cases that run without a doubt +in $O(n)$ time. + +
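For readers unfamiliar with the objective, the sketch below evaluates the linear-arrangement cost $\sum_{\{u,v\}\in E}|\pi(u)-\pi(v)|$ for a toy star tree under two arrangements; it does not implement the linear-time planar or projective algorithms discussed in the paper.

```python
# Minimum Linear Arrangement objective: sum of edge lengths under arrangement pi.
def mla_cost(edges, pi):
    return sum(abs(pi[u] - pi[v]) for u, v in edges)

# A star tree on 4 vertices; placing the hub in the middle is cheaper.
edges = [("hub", "a"), ("hub", "b"), ("hub", "c")]
print(mla_cost(edges, {"a": 1, "hub": 2, "b": 3, "c": 4}))  # 1 + 1 + 2 = 4
print(mla_cost(edges, {"hub": 1, "a": 2, "b": 3, "c": 4}))  # 1 + 2 + 3 = 6
```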
+
+ comment: Here we have corrected a mistake we made in the previous version. In + particular, line 7 of Algorithm 3.2 used to say: "For i = 1 to |C_v| ..."; it + should be "For i = 2 to |C_v| ..." (notice the change from 'i=1' to 'i=2') +
+
+
+
+
+ + ♻ ☆ Large Language Models and Cognitive Science: A Comprehensive Review of + Similarities, Differences, and Challenges + + +
+ This comprehensive review explores the intersection of Large Language Models +(LLMs) and cognitive science, examining similarities and differences between +LLMs and human cognitive processes. We analyze methods for evaluating LLMs' +cognitive abilities and discuss their potential as cognitive models. The review +covers applications of LLMs in various cognitive fields, highlighting insights +gained for cognitive science research. We assess cognitive biases and +limitations of LLMs, along with proposed methods for improving their +performance. The integration of LLMs with cognitive architectures is examined, +revealing promising avenues for enhancing artificial intelligence (AI) +capabilities. Key challenges and future research directions are identified, +emphasizing the need for continued refinement of LLMs to better align with +human cognition. This review provides a balanced perspective on the current +state and future potential of LLMs in advancing our understanding of both +artificial and human intelligence. + 

+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Predictability maximization and the origins of word order harmony + + +
+ We address the linguistic problem of the sequential arrangement of a head and +its dependents from an information theoretic perspective. In particular, we +consider the optimal placement of a head that maximizes the predictability of +the sequence. We assume that dependents are statistically independent given a +head, in line with the open-choice principle and the core assumptions of +dependency grammar. We demonstrate the optimality of harmonic order, i.e., +placing the head last maximizes the predictability of the head whereas placing +the head first maximizes the predictability of dependents. We also show that +postponing the head is the optimal strategy to maximize its predictability +while bringing it forward is the optimal strategy to maximize the +predictability of dependents. We unravel the advantages of the strategy of +maximizing the predictability of the head over maximizing the predictability of +dependents. Our findings shed light on the placements of the head adopted by +real languages or emerging in different kinds of experiments. + +
+
+ comment: Typos corrected; new references added +
+
+
+
+
+ + ♻ ☆ How Easily do Irrelevant Inputs Skew the Responses of Large Language + Models? + + +
+ By leveraging the retrieval of information from external knowledge databases, +Large Language Models (LLMs) exhibit enhanced capabilities for accomplishing +many knowledge-intensive tasks. However, due to the inherent flaws of current +retrieval systems, there might exist irrelevant information within those +retrieved top-ranked passages. In this work, we present a comprehensive +investigation into the robustness of LLMs to different types of irrelevant +information under various conditions. We initially introduce a framework to +construct high-quality irrelevant information that ranges from semantically +unrelated to partially related and related to questions. Furthermore, our +analysis demonstrates that the constructed irrelevant information not only +scores highly on similarity metrics, being highly retrieved by existing +systems, but also bears semantic connections to the context. Our investigation +reveals that current LLMs still face challenges in discriminating highly +semantically related information and can be easily distracted by this +irrelevant yet misleading content. Besides, we also find that current solutions +for handling irrelevant information have limitations in improving the +robustness of LLMs to such distractions. All the resources are available on +GitHub at https://github.com/Di-viner/LLM-Robustness-to-Irrelevant-Information. + 

+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Linear Adversarial Concept Erasure ICML 2022 + + +
+ Modern neural models trained on textual data rely on pre-trained +representations that emerge without direct supervision. As these +representations are increasingly being used in real-world applications, the +inability to control their content becomes an increasingly important +problem. We formulate the problem of identifying and erasing a linear subspace +that corresponds to a given concept, in order to prevent linear predictors from +recovering the concept. We model this problem as a constrained, linear maximin +game, and show that existing solutions are generally not optimal for this task. +We derive a closed-form solution for certain objectives, and propose a convex +relaxation that works well for others. When evaluated in the context +of binary gender removal, the method recovers a low-dimensional subspace whose +removal mitigates bias according to intrinsic and extrinsic evaluation. We show +that the method is highly expressive, effectively mitigating bias in deep nonlinear +classifiers while maintaining tractability and interpretability. + 

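The basic linear-erasure operation, projecting embeddings onto the orthogonal complement of a concept subspace, can be sketched as follows; this shows only the underlying idea, not the paper's closed-form solution or convex relaxation, and the data are random.

```python
# Remove a linear concept direction/subspace by orthogonal projection.
import numpy as np

def erase_subspace(X, W):
    # X: (n, d) embeddings; W: (d, k) basis whose span encodes the concept.
    Q, _ = np.linalg.qr(W)              # orthonormal basis of the concept subspace
    P = np.eye(X.shape[1]) - Q @ Q.T    # projector onto the orthogonal complement
    return X @ P

X = np.random.randn(100, 300)
gender_direction = np.random.randn(300, 1)
X_clean = erase_subspace(X, gender_direction)
print(np.abs(X_clean @ gender_direction).max())   # ~0: the component is removed
```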
+
+ comment: Accepted in ICML 2022; a revised version +
+
+
+
+
+ + ♻ ☆ I Know About "Up"! Enhancing Spatial Reasoning in Visual Language Models + Through 3D Reconstruction + + +
+ Visual Language Models (VLMs) are essential for various tasks, particularly +visual reasoning tasks, due to their robust multi-modal information +integration, visual reasoning capabilities, and contextual awareness. However, +existing VLMs' visual spatial reasoning capabilities are often inadequate, +struggling even with basic tasks such as distinguishing left from right. To +address this, we propose the ZeroVLM model, designed to enhance the visual +spatial reasoning abilities of VLMs. ZeroVLM employs Zero-1-to-3, a 3D +reconstruction model, to obtain different views of the input images, and +incorporates a prompting mechanism to further improve visual spatial reasoning. +Experimental results on four visual spatial reasoning datasets show that our +ZeroVLM achieves up to 19.48% accuracy improvement, which indicates the +effectiveness of the 3D reconstruction and prompting mechanisms of our ZeroVLM. + 

+
+
+
+
+ + ♻ ☆ Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with + BenchBench + + +
+ Recent advancements in Language Models (LMs) have catalyzed the creation of +multiple benchmarks, designed to assess these models' general capabilities. A +crucial task, however, is assessing the validity of the benchmarks themselves. +This is most commonly done via Benchmark Agreement Testing (BAT), where new +benchmarks are validated against established ones using some agreement metric +(e.g., rank correlation). Despite the crucial role of BAT for benchmark +builders and consumers, there are no standardized procedures for such agreement +testing. This deficiency can lead to invalid conclusions, fostering mistrust in +benchmarks and upending the ability to properly choose the appropriate +benchmark to use. By analyzing over 40 prominent benchmarks, we demonstrate how +some overlooked methodological choices can significantly influence BAT results, +potentially undermining the validity of conclusions. To address these +inconsistencies, we propose a set of best practices for BAT and demonstrate how +utilizing these methodologies greatly improves BAT robustness and validity. To +foster adoption and facilitate future research, we introduce BenchBench, a +Python package for BAT, and release the BenchBench-leaderboard, a +meta-benchmark designed to evaluate benchmarks using their peers. Our findings +underscore the necessity for standardized BAT, ensuring the robustness and +validity of benchmark evaluations in the evolving landscape of language model +research. + BenchBench Package: github.com/IBM/BenchBench + Leaderboard: hf.co/spaces/IBM/BenchBench + 

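A minimal example of the agreement-testing step itself: rank the same set of models on two benchmarks and report a rank-correlation metric. The scores below are invented, and this deliberately omits the methodological choices (reference benchmark, model subsets, granularity) that the paper shows can sway BAT results.

```python
# Benchmark Agreement Testing sketch via rank correlation between two benchmarks.
from scipy.stats import kendalltau

scores_bench_a = {"model1": 71.2, "model2": 68.4, "model3": 80.1}
scores_bench_b = {"model1": 55.0, "model2": 58.3, "model3": 60.9}

models = sorted(scores_bench_a)
tau, p_value = kendalltau([scores_bench_a[m] for m in models],
                          [scores_bench_b[m] for m in models])
print(f"Kendall tau = {tau:.2f} (p = {p_value:.2f})")
```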
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ The SAME score: Improved cosine based bias score for word embeddings + + +
+ With the enormous popularity of large language models, many researchers have +raised ethical concerns regarding social biases incorporated in such models. +Several methods to measure social bias have been introduced, but these methods +do not necessarily agree regarding the presence or severity of bias. +Furthermore, some works have shown theoretical issues or severe +limitations with certain bias measures. For that reason, we introduce SAME, a +novel bias score for semantic bias in embeddings. We conduct a thorough +theoretical analysis as well as experiments to show its benefits compared to +similar bias scores from the literature. We further highlight a substantial +relation between semantic bias measured by SAME and downstream bias, a connection +that has recently been argued to be negligible. Instead, we show that SAME is +capable of measuring semantic bias and identify potential causes for social +bias in downstream tasks. + 

+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Semantic Properties of cosine based bias scores for word embeddings + + +
+ Plenty of works have brought social biases in language models to attention +and proposed methods to detect such biases. As a result, the literature +contains a great deal of different bias tests and scores, each introduced with +the premise to uncover yet more biases that other scores fail to detect. What +the literature severely lacks, however, are comparative studies that analyse +such bias scores and help researchers to understand the benefits or limitations +of the existing methods. In this work, we aim to close this gap for cosine +based bias scores. By building on a geometric definition of bias, we propose +requirements for bias scores to be considered meaningful for quantifying +biases. Furthermore, we formally analyze cosine based scores from the +literature with regard to these requirements. We underline these findings with +experiments to show that the bias scores' limitations have an impact in the +application case. + 

+
+ comment: 11 pages, 3 figures. arXiv admin note: text overlap with + arXiv:2111.07864 +
+
+
+
+
+ + ♻ ☆ Evaluating Metrics for Bias in Word Embeddings + + +
+ Over the last years, word and sentence embeddings have become established as +text preprocessing for all kinds of NLP tasks and have improved performance +significantly. Unfortunately, it has also been shown that these embeddings +inherit various kinds of biases from the training data and thereby pass on +biases present in society to NLP solutions. Many papers attempted to quantify +bias in word or sentence embeddings to evaluate debiasing methods or compare +different embedding models, usually with cosine-based metrics. However, lately +some works have raised doubts about these metrics, showing that even though such +metrics report low biases, other tests still show biases. In fact, there is a +great variety of bias metrics or tests proposed in the literature without any +consensus on the optimal solutions. Yet we lack works that evaluate bias +metrics on a theoretical level or elaborate the advantages and disadvantages of +different bias metrics. In this work, we explore different cosine based +bias metrics. We formalize a bias definition based on the ideas from previous +works and derive conditions for bias metrics. Furthermore, we thoroughly +investigate the existing cosine-based metrics and their limitations to show why +these metrics can fail to report biases in some cases. Finally, we propose a +new metric, SAME, to address the shortcomings of existing metrics and +mathematically prove that SAME behaves appropriately. + 

+
+ comment: 32 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ StyleSinger: Style Transfer for Out-of-Domain Singing Voice Synthesis AAAI 2024 + + +
+ Style transfer for out-of-domain (OOD) singing voice synthesis (SVS) focuses +on generating high-quality singing voices with unseen styles (such as timbre, +emotion, pronunciation, and articulation skills) derived from reference singing +voice samples. However, the endeavor to model the intricate nuances of singing +voice styles is an arduous task, as singing voices possess a remarkable degree +of expressiveness. Moreover, existing SVS methods encounter a decline in the +quality of synthesized singing voices in OOD scenarios, as they rest upon the +assumption that the target vocal attributes are discernible during the training +phase. To overcome these challenges, we propose StyleSinger, the first singing +voice synthesis model for zero-shot style transfer of out-of-domain reference +singing voice samples. StyleSinger incorporates two critical approaches for +enhanced effectiveness: 1) the Residual Style Adaptor (RSA) which employs a +residual quantization module to capture diverse style characteristics in +singing voices, and 2) the Uncertainty Modeling Layer Normalization (UMLN) to +perturb the style attributes within the content representation during the +training phase and thus improve the model generalization. Our extensive +evaluations in zero-shot style transfer undeniably establish that StyleSinger +outperforms baseline models in both audio quality and similarity to the +reference singing voice samples. Access to singing voice samples can be found +at https://stylesinger.github.io/. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ On Leveraging Large Language Models for Enhancing Entity Resolution: A + Cost-efficient Approach + + +
+ Entity resolution, the task of identifying and merging records that refer to +the same real-world entity, is crucial in sectors like e-commerce, healthcare, +and law enforcement. Large Language Models (LLMs) introduce an innovative +approach to this task, capitalizing on their advanced linguistic capabilities +and a "pay-as-you-go" model that provides significant advantages to those +without extensive data science expertise. However, current LLMs are costly due +to per-API-request billing. Existing methods often either lack quality or +become prohibitively expensive at scale. To address these problems, we propose +an uncertainty reduction framework using LLMs to improve entity resolution +results. We first initialize possible partitions of the entity cluster, i.e., +the records that refer to the same entity, and define the uncertainty of the +result. Then, we reduce the uncertainty by selecting a few valuable matching +questions for LLM verification. Upon receiving the answers, we update the +probability distribution of the possible partitions. To further reduce costs, +we design an efficient algorithm to judiciously select the most valuable +matching pairs to query. Additionally, we create error-tolerant techniques to +handle LLM mistakes and a dynamic adjustment method to reach truly correct +partitions. Experimental results show that our method is efficient and +effective, offering promising applications in real-world tasks. + 

+
+ comment: 9 pages, preprint under review +
+
+
+
+
+ + ♻ ☆ RoboUniView: Visual-Language Model with Unified View Representation for + Robotic Manipulation + + +
+ Utilizing Vision-Language Models (VLMs) for robotic manipulation represents a +novel paradigm, aiming to enhance the model's ability to generalize to new +objects and instructions. However, due to variations in camera specifications +and mounting positions, existing methods exhibit significant performance +disparities across different robotic platforms. To address this challenge, we +propose RoboUniView in this paper, an innovative approach that decouples visual +feature extraction from action learning. We first learn a unified view +representation from multi-perspective views by pre-training on readily +accessible data, and then derive actions from this unified view representation +to control robotic manipulation. This unified view representation more +accurately mirrors the physical world and is not constrained by the robotic +platform's camera parameters. Thanks to this methodology, we achieve +state-of-the-art performance on the demanding CALVIN benchmark, enhancing the +success rate in the $D \to D$ setting from 93.0% to 96.2%, and in the $ABC \to +D$ setting from 92.2% to 94.2%. Moreover, our model exhibits outstanding +adaptability and flexibility: it maintains high performance under unseen camera +parameters, can utilize multiple datasets with varying camera parameters, and +is capable of joint cross-task learning across datasets. Code is provided for +re-implementation. https://github.com/liufanfanlff/RoboUniview + +
+
+
+
+
+ + ♻ ☆ Representational Analysis of Binding in Large Language Models + + +
+ Entity tracking is essential for complex reasoning. To perform in-context +entity tracking, language models (LMs) must bind an entity to its attribute +(e.g., bind a container to its content) to recall the attribute for a given entity. +For example, given a context mentioning ``The coffee is in Box Z, the stone is +in Box M, the map is in Box H'', to infer ``Box Z contains the coffee'' later, +LMs must bind ``Box Z'' to ``coffee''. To explain the binding behaviour of LMs, +Feng and Steinhardt (2023) introduce a Binding ID mechanism and state that LMs +use an abstract concept called the Binding ID (BI) to internally mark +entity-attribute pairs. However, they have not directly captured the BI +determinant information from entity activations. In this work, we provide a +novel view of the Binding ID mechanism by localizing the prototype of BI +information. Specifically, we discover that there exists a low-rank subspace in +the hidden state (or activation) of LMs that primarily encodes the order of +entity and attribute and which is used as the prototype of BI to causally +determine the binding. To identify this subspace, we choose principal component +analysis as our first attempt, and it is empirically proven to be effective. +Moreover, we also discover that when editing representations along directions +in the subspace, LMs tend to bind a given entity to other attributes +accordingly. For example, by patching activations along the BI encoding +direction, we can make the LM infer ``Box Z contains the stone'' and ``Box Z +contains the map''. + 

+
+ comment: The key phrase "BI Subspace" might be misleading, because it sounds + like the subspace that directly encodes BI, which differs from its intended + meaning, namely the subspace that serves as the base (or prototype) of BI. + Therefore, the naming of the subspace and its corresponding wording needs + further discussion and review +
+
+
+
+
+ + ♻ ☆ A Survey of Backdoor Attacks and Defenses on Large Language Models: + Implications for Security Measures + + +
+ Large Language Models (LLMs), which bridge the gap between human language +understanding and complex problem-solving, achieve state-of-the-art performance +on several NLP tasks, particularly in few-shot and zero-shot settings. Despite +the demonstrable efficacy of LLMs, due to constraints on computational +resources, users have to engage with open-source language models or outsource +the entire training process to third-party platforms. However, research has +demonstrated that language models are susceptible to potential security +vulnerabilities, particularly backdoor attacks. Backdoor attacks are +designed to introduce targeted vulnerabilities into language models by +poisoning training samples or model weights, allowing attackers to manipulate +model responses through malicious triggers. While existing surveys on backdoor +attacks provide a comprehensive overview, they lack an in-depth examination of +backdoor attacks specifically targeting LLMs. To bridge this gap and grasp the +latest trends in the field, this paper presents a novel perspective on backdoor +attacks for LLMs by focusing on fine-tuning methods. Specifically, we +systematically classify backdoor attacks into three categories: full-parameter +fine-tuning, parameter-efficient fine-tuning, and no fine-tuning. Based on +insights from a substantial review, we also discuss crucial issues for future +research on backdoor attacks, such as further exploring attack algorithms that +do not require fine-tuning, or developing more covert attack algorithms. + 

+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 52 + +
+
+
+ + ☆ 360PanT: Training-Free Text-Driven 360-Degree Panorama-to-Panorama + Translation WACV 2025 + + +
+ Preserving boundary continuity in the translation of 360-degree panoramas +remains a significant challenge for existing text-driven image-to-image +translation methods. These methods often produce visually jarring +discontinuities at the translated panorama's boundaries, disrupting the +immersive experience. To address this issue, we propose 360PanT, a +training-free approach to text-based 360-degree panorama-to-panorama +translation with boundary continuity. Our 360PanT achieves seamless +translations through two key components: boundary continuity encoding and +seamless tiling translation with spatial control. Firstly, the boundary +continuity encoding embeds critical boundary continuity information of the +input 360-degree panorama into the noisy latent representation by constructing +an extended input image. Secondly, leveraging this embedded noisy latent +representation and guided by a target prompt, the seamless tiling translation +with spatial control enables the generation of a translated image with +identical left and right halves while adhering to the extended input's +structure and semantic layout. This process ensures a final translated +360-degree panorama with seamless boundary continuity. Experimental results on +both real-world and synthesized datasets demonstrate the effectiveness of our +360PanT in translating 360-degree panoramas. Code is available at +\href{https://github.com/littlewhitesea/360PanT}{https://github.com/littlewhitesea/360PanT}. + +
+
+ comment: Accepted by WACV 2025, Project Page: + \href{https://littlewhitesea.github.io/360PanT.github.io/}{https://littlewhitesea.github.io/360PanT.github.io/} +
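+
+ The boundary-continuity idea above (building an extended input so the model sees wrap-around context, then mapping the result back) can be illustrated with a toy sketch; the padding fraction and the simple crop-back step are assumptions for illustration, not the paper's exact procedure.
+
+ ```python
+ import numpy as np
+
+ def extend_panorama(pano, pad_frac=0.25):
+     """Circularly pad the left/right borders of a 360-degree panorama."""
+     w = pano.shape[1]
+     pad = int(w * pad_frac)
+     extended = np.concatenate([pano[:, -pad:], pano, pano[:, :pad]], axis=1)
+     return extended, pad
+
+ def crop_back(extended, pad):
+     """Undo the circular padding after the translation step."""
+     return extended[:, pad:-pad]
+
+ pano = np.random.rand(256, 512, 3)
+ ext, pad = extend_panorama(pano)
+ restored = crop_back(ext, pad)
+ print(ext.shape, np.allclose(restored, pano))
+ ```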
+
+
+
+
+ + ☆ Continual Learning in 3D Point Clouds: Employing Spectral Techniques for + Exemplar Selection + + +
+ We introduce a novel framework for Continual Learning in 3D object +classification (CL3D). Our approach is based on the selection of prototypes +from each class using spectral clustering. For non-Euclidean data such as point +clouds, spectral clustering can be employed as long as one can define a +distance measure between pairs of samples. Choosing the appropriate distance +measure enables us to leverage 3D geometric characteristics to identify +representative prototypes for each class. We explore the effectiveness of +clustering in the input space (3D points), local feature space +(1024-dimensional points), and global feature space. We conduct experiments on +the ModelNet40, ShapeNet, and ScanNet datasets, achieving state-of-the-art +accuracy exclusively through the use of input space features. By leveraging the +combined input, local, and global features, we have improved the +state-of-the-art on ModelNet and ShapeNet, utilizing nearly half the memory +used by competing approaches. For the challenging ScanNet dataset, our method +enhances accuracy by 4.1% while consuming just 28% of the memory used by our +competitors, demonstrating the scalability of our approach. + +
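+
+ A rough sketch of the exemplar-selection recipe described above: spectral clustering over a pairwise point-cloud distance matrix, with cluster medoids kept as prototypes. The Chamfer distance, RBF affinity, and medoid rule are illustrative choices, not necessarily the paper's.
+
+ ```python
+ import numpy as np
+ from sklearn.cluster import SpectralClustering
+
+ def chamfer(a, b):
+     """Symmetric Chamfer distance between two (N, 3) / (M, 3) point sets."""
+     d = np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)
+     return d.min(1).mean() + d.min(0).mean()
+
+ def select_prototypes(clouds, n_prototypes=3, sigma=1.0):
+     n = len(clouds)
+     dist = np.zeros((n, n))
+     for i in range(n):
+         for j in range(i + 1, n):
+             dist[i, j] = dist[j, i] = chamfer(clouds[i], clouds[j])
+     affinity = np.exp(-dist ** 2 / (2 * sigma ** 2))   # RBF affinity from distances
+     labels = SpectralClustering(n_clusters=n_prototypes, affinity="precomputed",
+                                 random_state=0).fit_predict(affinity)
+     prototypes = []
+     for c in range(n_prototypes):
+         idx = np.where(labels == c)[0]
+         medoid = idx[np.argmin(dist[np.ix_(idx, idx)].sum(1))]  # cluster medoid as exemplar
+         prototypes.append(int(medoid))
+     return prototypes
+
+ rng = np.random.default_rng(0)
+ clouds = [rng.normal(size=(64, 3)) + k for k in range(9)]
+ print(select_prototypes(clouds))
+ ```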
+
+
+
+
+ + ☆ Rethinking Prompting Strategies for Multi-Label Recognition with Partial + Annotations + + +
+ Vision-language models (VLMs) like CLIP have been adapted for Multi-Label +Recognition (MLR) with partial annotations by leveraging prompt-learning, where +positive and negative prompts are learned for each class to associate their +embeddings with class presence or absence in the shared vision-text feature +space. While this approach improves MLR performance by relying on VLM priors, +we hypothesize that learning negative prompts may be suboptimal, as the +datasets used to train VLMs lack image-caption pairs explicitly focusing on +class absence. To analyze the impact of positive and negative prompt learning +on MLR, we introduce PositiveCoOp and NegativeCoOp, where only one prompt is +learned with VLM guidance while the other is replaced by an embedding vector +learned directly in the shared feature space without relying on the text +encoder. Through empirical analysis, we observe that negative prompts degrade +MLR performance, and learning only positive prompts, combined with learned +negative embeddings (PositiveCoOp), outperforms dual prompt learning +approaches. Moreover, we quantify the performance benefits that prompt-learning +offers over a simple vision-features-only baseline, observing that the baseline +displays strong performance comparable to the dual prompt learning approach +(DualCoOp) when the proportion of missing labels is low, while requiring half +the training compute and 16 times fewer parameters. +
+
+
+
+
+ + ☆ Learned Compression for Images and Point Clouds + + +
+ Over the last decade, deep learning has shown great success at performing +computer vision tasks, including classification, super-resolution, and style +transfer. Now, we apply it to data compression to help build the next +generation of multimedia codecs. This thesis provides three primary +contributions to this new field of learned compression. First, we present an +efficient low-complexity entropy model that dynamically adapts the encoding +distribution to a specific input by compressing and transmitting the encoding +distribution itself as side information. Secondly, we propose a novel +lightweight low-complexity point cloud codec that is highly specialized for +classification, attaining significant reductions in bitrate compared to +non-specialized codecs. Lastly, we explore how motion within the input domain +between consecutive video frames is manifested in the corresponding +convolutionally-derived latent space. + +
+
+ comment: 65 pages, 21 figures, Master's Thesis, defended in 2023 +
+
+
+
+
+ + ☆ Robust Dual Gaussian Splatting for Immersive Human-centric Volumetric + Videos SIGGRAPH + + +
+ Volumetric video represents a transformative advancement in visual media, +enabling users to freely navigate immersive virtual experiences and narrowing +the gap between digital and real worlds. However, the need for extensive manual +intervention to stabilize mesh sequences and the generation of excessively +large assets in existing workflows impedes broader adoption. In this paper, we +present a novel Gaussian-based approach, dubbed \textit{DualGS}, for real-time +and high-fidelity playback of complex human performance with excellent +compression ratios. Our key idea in DualGS is to separately represent motion +and appearance using the corresponding skin and joint Gaussians. Such an +explicit disentanglement can significantly reduce motion redundancy and enhance +temporal coherence. We begin by initializing the DualGS and anchoring skin +Gaussians to joint Gaussians at the first frame. Subsequently, we employ a +coarse-to-fine training strategy for frame-by-frame human performance modeling. +It includes a coarse alignment phase for overall motion prediction as well as a +fine-grained optimization for robust tracking and high-fidelity rendering. To +integrate volumetric video seamlessly into VR environments, we efficiently +compress motion using entropy encoding and appearance using codec compression +coupled with a persistent codebook. Our approach achieves a compression ratio +of up to 120 times, only requiring approximately 350KB of storage per frame. We +demonstrate the efficacy of our representation through photo-realistic, +free-view experiences on VR headsets, enabling users to immersively watch +musicians in performance and feel the rhythm of the notes at the performers' +fingertips. + +
+
+ comment: Accepted at SIGGRAPH Asia 2024. Project page: + https://nowheretrix.github.io/DualGS/ +
+
+
+
+
+ + ☆ Bayesian Inverse Graphics for Few-Shot Concept Learning + + +
+ Humans excel at building generalizations of new concepts from just a single +example. Contrary to this, current computer vision models typically require a +large amount of training samples to achieve a comparable accuracy. In this work +we present a Bayesian model of perception that learns using only minimal data, +a prototypical probabilistic program of an object. Specifically, we propose a +generative inverse graphics model of primitive shapes, to infer posterior +distributions over physically consistent parameters from one or several images. +We show how this representation can be used for downstream tasks such as +few-shot classification and pose estimation. Our model outperforms existing +few-shot neural-only classification algorithms and demonstrates generalization +across varying lighting conditions, backgrounds, and out-of-distribution +shapes. By design, our model is uncertainty-aware and uses our new +differentiable renderer for optimizing global scene parameters through gradient +descent, sampling posterior distributions over object parameters with Markov +Chain Monte Carlo (MCMC), and using a neural-based likelihood function. +
+
+
+
+
+ + ☆ SIG: A Synthetic Identity Generation Pipeline for Generating Evaluation + Datasets for Face Recognition + + +
+ As Artificial Intelligence applications expand, the evaluation of models +faces heightened scrutiny. Ensuring public readiness requires evaluation +datasets, which differ from training data by being disjoint and ethically +sourced in compliance with privacy regulations. The performance and fairness of +face recognition systems depend significantly on the quality and +representativeness of these evaluation datasets. This data is sometimes scraped +from the internet without user's consent, causing ethical concerns that can +prohibit its use without proper releases. In rare cases, data is collected in a +controlled environment with consent, however, this process is time-consuming, +expensive, and logistically difficult to execute. This creates a barrier for +those unable to conjure the immense resources required to gather ethically +sourced evaluation datasets. To address these challenges, we introduce the +Synthetic Identity Generation pipeline, or SIG, that allows for the targeted +creation of ethical, balanced datasets for face recognition evaluation. Our +proposed and demonstrated pipeline generates high-quality images of synthetic +identities with controllable pose, facial features, and demographic attributes, +such as race, gender, and age. We also release an open-source evaluation +dataset named ControlFace10k, consisting of 10,008 face images of 3,336 unique +synthetic identities balanced across race, gender, and age, generated using the +proposed SIG pipeline. We analyze ControlFace10k along with a non-synthetic +BUPT dataset using state-of-the-art face recognition algorithms to demonstrate +its effectiveness as an evaluation tool. This analysis highlights the dataset's +characteristics and its utility in assessing algorithmic bias across different +demographic groups. + +
+
+
+
+
+ + ☆ Digital Volumetric Biopsy Cores Improve Gleason Grading of Prostate + Cancer Using Deep Learning + + +
+ Prostate cancer (PCa) was the most frequently diagnosed cancer among American +men in 2023. The histological grading of biopsies is essential for diagnosis, +and various deep learning-based solutions have been developed to assist with +this task. Existing deep learning frameworks are typically applied to +individual 2D cross-sections sliced from 3D biopsy tissue specimens. This +process impedes the analysis of complex tissue structures such as glands, which +can vary depending on the tissue slice examined. We propose a novel digital +pathology data source called a "volumetric core," obtained via the extraction +and co-alignment of serially sectioned tissue sections using a novel +morphology-preserving alignment framework. We trained an attention-based +multiple-instance learning (ABMIL) framework on deep features extracted from +volumetric patches to automatically classify the Gleason Grade Group (GGG). To +handle volumetric patches, we used a modified video transformer with a deep +feature extractor pretrained using self-supervised learning. We ran our +morphology-preserving alignment framework to construct 10,210 volumetric cores, +leaving out 30% for pretraining. The rest of the dataset was used to train +ABMIL, which resulted in a 0.958 macro-average AUC, 0.671 F1 score, 0.661 +precision, and 0.695 recall averaged across all five GGGs, significantly +outperforming the 2D baselines. +
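+
+ Since the grading pipeline above rests on attention-based multiple-instance learning, a compact ABMIL-style pooling layer is sketched below; feature dimensions and class count are assumptions, and the authors' actual extractor and video transformer are not reproduced.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ABMILHead(nn.Module):
+     def __init__(self, feat_dim=768, hidden_dim=128, n_classes=5):
+         super().__init__()
+         self.attn = nn.Sequential(nn.Linear(feat_dim, hidden_dim), nn.Tanh(),
+                                   nn.Linear(hidden_dim, 1))
+         self.classifier = nn.Linear(feat_dim, n_classes)
+
+     def forward(self, instance_feats):                        # (num_instances, feat_dim)
+         a = torch.softmax(self.attn(instance_feats), dim=0)   # attention weight per instance
+         bag = (a * instance_feats).sum(dim=0)                 # weighted bag-level embedding
+         return self.classifier(bag), a.squeeze(-1)
+
+ logits, weights = ABMILHead()(torch.randn(200, 768))
+ print(logits.shape, weights.shape)
+ ```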
+
+
+
+
+ + ☆ DreamHOI: Subject-Driven Generation of 3D Human-Object Interactions with + Diffusion Priors + + +
+ We present DreamHOI, a novel method for zero-shot synthesis of human-object +interactions (HOIs), enabling a 3D human model to realistically interact with +any given object based on a textual description. This task is complicated by +the varying categories and geometries of real-world objects and the scarcity of +datasets encompassing diverse HOIs. To circumvent the need for extensive data, +we leverage text-to-image diffusion models trained on billions of image-caption +pairs. We optimize the articulation of a skinned human mesh using Score +Distillation Sampling (SDS) gradients obtained from these models, which predict +image-space edits. However, directly backpropagating image-space gradients into +complex articulation parameters is ineffective due to the local nature of such +gradients. To overcome this, we introduce a dual implicit-explicit +representation of a skinned mesh, combining (implicit) neural radiance fields +(NeRFs) with (explicit) skeleton-driven mesh articulation. During optimization, +we transition between implicit and explicit forms, grounding the NeRF +generation while refining the mesh articulation. We validate our approach +through extensive experiments, demonstrating its effectiveness in generating +realistic HOIs. + +
+
+ comment: Project page: https://DreamHOI.github.io/ +
+
+
+
+
+ + ☆ Depth on Demand: Streaming Dense Depth from a Low Frame Rate Active + Sensor ECCV + + +
+ High frame rate and accurate depth estimation plays an important role in +several tasks crucial to robotics and automotive perception. To date, this can +be achieved through ToF and LiDAR devices for indoor and outdoor applications, +respectively. However, their applicability is limited by low frame rate, energy +consumption, and spatial sparsity. Depth on Demand (DoD) allows for accurate +temporal and spatial depth densification achieved by exploiting a high frame +rate RGB sensor coupled with a potentially lower frame rate and sparse active +depth sensor. Our proposal jointly enables lower energy consumption and denser +shape reconstruction, by significantly reducing the streaming requirements on +the depth sensor thanks to its three core stages: i) multi-modal encoding, ii) +iterative multi-modal integration, and iii) depth decoding. We present extended +evidence assessing the effectiveness of DoD on indoor and outdoor video +datasets, covering both environment scanning and automotive perception use +cases. + +
+
+ comment: Accepted for publication at the European Conference on Computer + Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ DeCLIP: Decoding CLIP representations for deepfake localization WACV + + +
+ Generative models can create entirely new images, but they can also partially +modify real images in ways that are undetectable to the human eye. In this +paper, we address the challenge of automatically detecting such local +manipulations. One of the most pressing problems in deepfake detection remains +the ability of models to generalize to different classes of generators. In the +case of fully manipulated images, representations extracted from large +self-supervised models (such as CLIP) provide a promising direction towards +more robust detectors. Here, we introduce DeCLIP, a first attempt to leverage +such large pretrained features for detecting local manipulations. We show that, +when combined with a reasonably large convolutional decoder, pretrained +self-supervised representations are able to perform localization and improve +generalization capabilities over existing methods. Unlike previous work, our +approach is able to perform localization on the challenging case of latent +diffusion models, where the entire image is affected by the fingerprint of the +generator. Moreover, we observe that this type of data, which combines local +semantic information with a global fingerprint, provides more stable +generalization than other categories of generative methods. + +
+
+ comment: Accepted at Winter Conference on Applications of Computer Vision + (WACV) 2025 +
+
+
+
+
+ + ☆ Hand-Object Interaction Pretraining from Videos + + +
+ We present an approach to learn general robot manipulation priors from 3D +hand-object interaction trajectories. We build a framework to use in-the-wild +videos to generate sensorimotor robot trajectories. We do so by lifting both +the human hand and the manipulated object in a shared 3D space and retargeting +human motions to robot actions. Generative modeling on this data gives us a +task-agnostic base policy. This policy captures a general yet flexible +manipulation prior. We empirically demonstrate that finetuning this policy, +with both reinforcement learning (RL) and behavior cloning (BC), enables +sample-efficient adaptation to downstream tasks and simultaneously improves +robustness and generalizability compared to prior approaches. Qualitative +experiments are available at: \url{https://hgaurav2k.github.io/hop/}. + +
+
+
+
+
+ + ☆ Click2Mask: Local Editing with Dynamic Mask Generation + + +
+ Recent advancements in generative models have revolutionized image generation +and editing, making these tasks accessible to non-experts. This paper focuses +on local image editing, particularly the task of adding new content to a +loosely specified area. Existing methods often require a precise mask or a +detailed description of the location, which can be cumbersome and prone to +errors. We propose Click2Mask, a novel approach that simplifies the local +editing process by requiring only a single point of reference (in addition to +the content description). A mask is dynamically grown around this point during +a Blended Latent Diffusion (BLD) process, guided by a masked CLIP-based +semantic loss. Click2Mask surpasses the limitations of segmentation-based and +fine-tuning dependent methods, offering a more user-friendly and contextually +accurate solution. Our experiments demonstrate that Click2Mask not only +minimizes user effort but also delivers competitive or superior local image +manipulation results compared to SoTA methods, according to both human +judgement and automatic metrics. Key contributions include the simplification +of user input, the ability to freely add objects unconstrained by existing +segments, and the integration potential of our dynamic mask approach within +other editing methods. + +
+
+ comment: Project page is available at https://omeregev.github.io/click2mask/ +
+
+
+
+
+ + ☆ DreamBeast: Distilling 3D Fantastical Animals with Part-Aware Knowledge + Transfer + + +
+ We present DreamBeast, a novel method based on score distillation sampling +(SDS) for generating fantastical 3D animal assets composed of distinct parts. +Existing SDS methods often struggle with this generation task due to a limited +understanding of part-level semantics in text-to-image diffusion models. While +recent diffusion models, such as Stable Diffusion 3, demonstrate a better +part-level understanding, they are prohibitively slow and exhibit other common +problems associated with single-view diffusion models. DreamBeast overcomes +this limitation through a novel part-aware knowledge transfer mechanism. For +each generated asset, we efficiently extract part-level knowledge from the +Stable Diffusion 3 model into a 3D Part-Affinity implicit representation. This +enables us to instantly generate Part-Affinity maps from arbitrary camera +views, which we then use to modulate the guidance of a multi-view diffusion +model during SDS to create 3D assets of fantastical animals. DreamBeast +significantly enhances the quality of generated 3D creatures with +user-specified part compositions while reducing computational overhead, as +demonstrated by extensive quantitative and qualitative evaluations. + +
+
+ comment: Project page: https://dreambeast3d.github.io/, code: + https://github.com/runjiali-rl/threestudio-dreambeast +
+
+
+
+
+ + ☆ FlashSplat: 2D to 3D Gaussian Splatting Segmentation Solved Optimally ECCV'2024 + + +
+ This study addresses the challenge of accurately segmenting 3D Gaussian +Splatting from 2D masks. Conventional methods often rely on iterative gradient +descent to assign each Gaussian a unique label, leading to lengthy optimization +and sub-optimal solutions. Instead, we propose a straightforward yet globally +optimal solver for 3D-GS segmentation. The core insight of our method is that, +with a reconstructed 3D-GS scene, the rendering of the 2D masks is essentially +a linear function with respect to the labels of each Gaussian. As such, the +optimal label assignment can be solved via linear programming in closed form. +This solution capitalizes on the alpha blending characteristic of the splatting +process for single step optimization. By incorporating the background bias in +our objective function, our method shows superior robustness in 3D segmentation +against noises. Remarkably, our optimization completes within 30 seconds, about +50$\times$ faster than the best existing methods. Extensive experiments +demonstrate the efficiency and robustness of our method in segmenting various +scenes, and its superior performance in downstream tasks such as object removal +and inpainting. Demos and code will be available at +https://github.com/florinshen/FlashSplat. + +
+
+ comment: ECCV'2024 +
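+
+ One plausible, heavily simplified reading of the closed-form assignment described above (binary foreground/background case): because each Gaussian's rendered contribution is linear in its label, its label can be set by comparing its accumulated blending weight inside versus outside the 2D mask. The data layout and the background-bias term below are assumptions, not the paper's implementation.
+
+ ```python
+ import numpy as np
+
+ def assign_labels(alpha_weights, gaussian_ids, mask, bg_bias=0.0):
+     """
+     alpha_weights : (K,) blending weight of each (pixel, gaussian) contribution
+     gaussian_ids  : (K,) index of the Gaussian producing each contribution
+     mask          : (K,) 1 if the contribution falls on a foreground mask pixel
+     """
+     n = int(gaussian_ids.max()) + 1
+     fg = np.zeros(n)
+     bg = np.zeros(n)
+     np.add.at(fg, gaussian_ids, alpha_weights * mask)        # weight accumulated inside mask
+     np.add.at(bg, gaussian_ids, alpha_weights * (1 - mask))  # weight accumulated outside mask
+     return (fg > bg + bg_bias).astype(int)                   # single-step label assignment
+
+ rng = np.random.default_rng(0)
+ k = 10_000
+ labels = assign_labels(rng.random(k), rng.integers(0, 500, k), rng.integers(0, 2, k))
+ print(labels[:10])
+ ```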
+
+
+
+
+ + ☆ Improving Text-guided Object Inpainting with Semantic Pre-inpainting ECCV 2024 + + +
+ Recent years have witnessed the success of large text-to-image diffusion +models and their remarkable potential to generate high-quality images. The +further pursuit of enhancing the editability of images has sparked significant +interest in the downstream task of inpainting a novel object described by a +text prompt within a designated region in the image. Nevertheless, the problem +is not trivial from two aspects: 1) Solely relying on one single U-Net to align +text prompt and visual object across all the denoising timesteps is +insufficient to generate desired objects; 2) The controllability of object +generation is not guaranteed in the intricate sampling space of the diffusion +model. In this paper, we propose to decompose the typical single-stage object +inpainting into two cascaded processes: 1) semantic pre-inpainting that infers +the semantic features of desired objects in a multi-modal feature space; 2) +high-fidelity object generation in diffusion latent space that pivots on such +inpainted semantic features. To achieve this, we cascade a Transformer-based +semantic inpainter and an object inpainting diffusion model, leading to a novel +CAscaded Transformer-Diffusion (CAT-Diffusion) framework for text-guided object +inpainting. Technically, the semantic inpainter is trained to predict the +semantic features of the target object conditioning on unmasked context and +text prompt. The outputs of the semantic inpainter then act as the informative +visual prompts to guide high-fidelity object generation through a reference +adapter layer, leading to controllable object inpainting. Extensive evaluations +on OpenImages-V6 and MSCOCO validate the superiority of CAT-Diffusion against +the state-of-the-art methods. Code is available at +\url{https://github.com/Nnn-s/CATdiffusion}. +
+
+ comment: ECCV 2024. Source code is available at + https://github.com/Nnn-s/CATdiffusion +
+
+
+
+
+ + ☆ Improving Virtual Try-On with Garment-focused Diffusion Models ECCV 2024 + + +
+ Diffusion models have revolutionized generative modeling in +numerous image synthesis tasks. Nevertheless, it is not trivial to directly +apply diffusion models for synthesizing an image of a target person wearing a +given in-shop garment, i.e., the image-based virtual try-on (VTON) task. The +difficulty originates from the aspect that the diffusion process should not +only produce a holistically high-fidelity, photorealistic image of the target +person, but also locally preserve every appearance and texture detail of the +given garment. To address this, we shape a new Diffusion model, namely GarDiff, +which triggers the garment-focused diffusion process with amplified guidance of +both basic visual appearance and detailed textures (i.e., high-frequency +details) derived from the given garment. GarDiff first remoulds a pre-trained +latent diffusion model with additional appearance priors derived from the CLIP +and VAE encodings of the reference garment. Meanwhile, a novel garment-focused +adapter is integrated into the UNet of the diffusion model, pursuing local +fine-grained alignment with the visual appearance of the reference garment and +human pose. We specifically design an appearance loss over the synthesized +garment to enhance the crucial, high-frequency details. Extensive experiments +on VITON-HD and DressCode datasets demonstrate the superiority of our GarDiff +when compared to state-of-the-art VTON approaches. Code is publicly available +at: +\href{https://github.com/siqi0905/GarDiff/tree/master}{https://github.com/siqi0905/GarDiff/tree/master}. +
+
+ comment: ECCV 2024. Source code is available at + https://github.com/siqi0905/GarDiff/tree/master +
+
+
+
+
+ + ☆ Dynamic Prompting of Frozen Text-to-Image Diffusion Models for Panoptic + Narrative Grounding ACM MM 2024 + + +
+ Panoptic narrative grounding (PNG), whose core target is fine-grained +image-text alignment, requires a panoptic segmentation of referred objects +given a narrative caption. Previous discriminative methods achieve only weak or +coarse-grained alignment by panoptic segmentation pretraining or CLIP model +adaptation. Given the recent progress of text-to-image Diffusion models, +several works have shown their capability to achieve fine-grained image-text +alignment through cross-attention maps and improved general segmentation +performance. However, the direct use of phrase features as static prompts to +apply frozen Diffusion models to the PNG task still suffers from a large task +gap and insufficient vision-language interaction, yielding inferior +performance. Therefore, we propose an Extractive-Injective Phrase Adapter +(EIPA) bypass within the Diffusion UNet to dynamically update phrase prompts +with image features and inject the multimodal cues back, which leverages the +fine-grained image-text alignment capability of Diffusion models more +sufficiently. In addition, we also design a Multi-Level Mutual Aggregation +(MLMA) module to reciprocally fuse multi-level image and phrase features for +segmentation refinement. Extensive experiments on the PNG benchmark show that +our method achieves new state-of-the-art performance. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ TextBoost: Towards One-Shot Personalization of Text-to-Image Models via + Fine-tuning Text Encoder + + +
+ Recent breakthroughs in text-to-image models have opened up promising +research avenues in personalized image generation, enabling users to create +diverse images of a specific subject using natural language prompts. However, +existing methods often suffer from performance degradation when given only a +single reference image. They tend to overfit the input, producing highly +similar outputs regardless of the text prompt. This paper addresses the +challenge of one-shot personalization by mitigating overfitting, enabling the +creation of controllable images through text prompts. Specifically, we propose +a selective fine-tuning strategy that focuses on the text encoder. Furthermore, +we introduce three key techniques to enhance personalization performance: (1) +augmentation tokens to encourage feature disentanglement and alleviate +overfitting, (2) a knowledge-preservation loss to reduce language drift and +promote generalizability across diverse prompts, and (3) SNR-weighted sampling +for efficient training. Extensive experiments demonstrate that our approach +efficiently generates high-quality, diverse images using only a single +reference image while significantly reducing memory and storage requirements. + +
+
+ comment: Project page: https://textboost.github.io +
+
+
+
+
+ + ☆ Style Based Clustering of Visual Artworks + + +
+ Clustering artworks based on style has many potential real-world applications +like art recommendations, style-based search and retrieval, and the study of +artistic style evolution in an artwork corpus. However, clustering artworks +based on style is largely an unaddressed problem. A few present methods for +clustering artworks principally rely on generic image feature representations +derived from deep neural networks and do not specifically deal with the +artistic style. In this paper, we introduce and deliberate over the notion of +style-based clustering of visual artworks. Our main objective is to explore +neural feature representations and architectures that can be used for +style-based clustering and observe their impact and effectiveness. We develop +different methods and assess their relative efficacy for style-based clustering +through qualitative and quantitative analysis by applying them to four artwork +corpora and four curated synthetically styled datasets. Our analysis provides +some key novel insights on architectures, feature representations, and +evaluation methods suitable for style-based clustering. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ IFAdapter: Instance Feature Control for Grounded Text-to-Image + Generation + + +
+ While Text-to-Image (T2I) diffusion models excel at generating visually +appealing images of individual instances, they struggle to accurately position +multiple instances and control their feature generation. The Layout-to-Image +(L2I) task was introduced to address the positioning challenges by +incorporating bounding boxes as spatial control signals, but it still falls +short in generating precise instance features. In response, we propose the +Instance Feature Generation (IFG) task, which aims to ensure both positional +accuracy and feature fidelity in generated instances. To address the IFG task, +we introduce the Instance Feature Adapter (IFAdapter). The IFAdapter enhances +feature depiction by incorporating additional appearance tokens and utilizing +an Instance Semantic Map to align instance-level features with spatial +locations. The IFAdapter guides the diffusion process as a plug-and-play +module, making it adaptable to various community models. For evaluation, we +contribute an IFG benchmark and develop a verification pipeline to objectively +compare models' abilities to generate instances with accurate positioning and +features. Experimental results demonstrate that IFAdapter outperforms other +models in both quantitative and qualitative evaluations. +
+
+
+
+
+ + ☆ Model Ensemble for Brain Tumor Segmentation in Magnetic Resonance + Imaging MICCAI 2023 + + +
+ Segmenting brain tumors in multi-parametric magnetic resonance imaging +enables performing quantitative analysis in support of clinical trials and +personalized patient care. This analysis provides the potential to impact +clinical decision-making processes, including diagnosis and prognosis. In 2023, +the well-established Brain Tumor Segmentation (BraTS) challenge presented a +substantial expansion with eight tasks and 4,500 brain tumor cases. In this +paper, we present a deep learning-based ensemble strategy that is evaluated for +newly included tumor cases in three tasks: pediatric brain tumors (PED), +intracranial meningioma (MEN), and brain metastases (MET). In particular, we +ensemble outputs from state-of-the-art nnU-Net and Swin UNETR models on a +region-wise basis. Furthermore, we implemented a targeted post-processing +strategy based on a cross-validated threshold search to improve the +segmentation results for tumor sub-regions. The evaluation of our proposed +method on unseen test cases for the three tasks resulted in lesion-wise Dice +scores for PED: 0.653, 0.809, 0.826; MEN: 0.876, 0.867, 0.849; and MET: 0.555, +0.6, 0.58; for the enhancing tumor, tumor core, and whole tumor, respectively. +Our method was ranked first for PED, third for MEN, and fourth for MET, +respectively. + +
+
+ comment: 11 pages, 6 figures, 2 tables; This method ranked 1st, 3rd and 4th + for BraTS2023 PED, MEN, and MET, respectively. This paper was accepted at + MICCAI 2023's BrainLes Workshop +
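+
+ A bare-bones sketch of the two ingredients named above, region-wise ensembling of probability maps and a threshold-based post-process; the equal model weighting and the cutoff values are placeholders, not the cross-validated values used by the authors.
+
+ ```python
+ import numpy as np
+
+ def ensemble_and_postprocess(prob_maps, prob_thr=0.5, min_voxels=200):
+     """prob_maps: list of (C, D, H, W) per-region probability outputs from different models."""
+     mean_prob = np.mean(prob_maps, axis=0)            # simple average ensemble
+     seg = (mean_prob > prob_thr).astype(np.uint8)
+     # drop tumor sub-regions whose predicted volume falls below a searched cutoff
+     for c in range(seg.shape[0]):
+         if seg[c].sum() < min_voxels:
+             seg[c] = 0
+     return seg
+
+ maps = [np.random.rand(3, 64, 64, 64) for _ in range(2)]   # e.g. two segmentation models
+ print(ensemble_and_postprocess(maps).shape)
+ ```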
+
+
+
+
+ + ☆ LT3SD: Latent Trees for 3D Scene Diffusion + + +
+ We present LT3SD, a novel latent diffusion model for large-scale 3D scene +generation. Recent advances in diffusion models have shown impressive results +in 3D object generation, but are limited in spatial extent and quality when +extended to 3D scenes. To generate complex and diverse 3D scene structures, we +introduce a latent tree representation to effectively encode both +lower-frequency geometry and higher-frequency detail in a coarse-to-fine +hierarchy. We can then learn a generative diffusion process in this latent 3D +scene space, modeling the latent components of a scene at each resolution +level. To synthesize large-scale scenes with varying sizes, we train our +diffusion model on scene patches and synthesize arbitrary-sized output 3D +scenes through shared diffusion generation across multiple scene patches. +Through extensive experiments, we demonstrate the efficacy and benefits of +LT3SD for large-scale, high-quality unconditional 3D scene generation and for +probabilistic completion for partial scene observations. + +
+
+ comment: Project page: https://quan-meng.github.io/projects/lt3sd/ Video: + https://youtu.be/AJ5sG9VyjGA +
+
+
+
+
+ + ☆ VI3DRM: Towards meticulous 3D Reconstruction from Sparse Views via + Photo-Realistic Novel View Synthesis + +
+ Recently, methods like Zero-1-2-3 have focused on single-view based 3D +reconstruction and have achieved remarkable success. However, their predictions +for unseen areas heavily rely on the inductive bias of large-scale pretrained +diffusion models. Although subsequent work, such as DreamComposer, attempts to +make predictions more controllable by incorporating additional views, the +results remain unrealistic due to feature entanglement in the vanilla latent +space, including factors such as lighting, material, and structure. To address +these issues, we introduce the Visual Isotropy 3D Reconstruction Model +(VI3DRM), a diffusion-based sparse views 3D reconstruction model that operates +within an ID consistent and perspective-disentangled 3D latent space. By +facilitating the disentanglement of semantic information, color, material +properties and lighting, VI3DRM is capable of generating highly realistic +images that are indistinguishable from real photographs. By leveraging both +real and synthesized images, our approach enables the accurate construction of +pointmaps, ultimately producing finely textured meshes or point clouds. On the +NVS task, tested on the GSO dataset, VI3DRM significantly outperforms +state-of-the-art method DreamComposer, achieving a PSNR of 38.61, an SSIM of +0.929, and an LPIPS of 0.027. Code will be made available upon publication. + +
+
+
+
+
+ + ☆ ComAlign: Compositional Alignment in Vision-Language Models + + +
+ Vision-language models (VLMs) like CLIP have showcased a remarkable ability +to extract transferable features for downstream tasks. Nonetheless, the +training process of these models is usually based on a coarse-grained +contrastive loss between the global embedding of images and texts which may +lose the compositional structure of these modalities. Many recent studies have +shown VLMs lack compositional understandings like attribute binding and +identifying object relationships. Although some recent methods have tried to +achieve finer-level alignments, they either are not based on extracting +meaningful components of proper granularity or don't properly utilize the +modalities' correspondence (especially in image-text pairs with more +ingredients). Addressing these limitations, we introduce Compositional +Alignment (ComAlign), a fine-grained approach to discover more exact +correspondence of text and image components using only the weak supervision in +the form of image-text pairs. Our methodology emphasizes that the compositional +structure (including entities and relations) extracted from the text modality +must also be retained in the image modality. To enforce correspondence of +fine-grained concepts in image and text modalities, we train a lightweight +network lying on top of existing visual and language encoders using a small +dataset. The network is trained to align nodes and edges of the structure +across the modalities. Experimental results on various VLMs and datasets +demonstrate significant improvements in retrieval and compositional benchmarks, +affirming the effectiveness of our plugin model. + +
+
+
+
+
+ + ☆ What Makes a Maze Look Like a Maze? + + +
+ A unique aspect of human visual understanding is the ability to flexibly +interpret abstract concepts: acquiring lifted rules explaining what they +symbolize, grounding them across familiar and unfamiliar contexts, and making +predictions or reasoning about them. While off-the-shelf vision-language models +excel at making literal interpretations of images (e.g., recognizing object +categories such as tree branches), they still struggle to make sense of such +visual abstractions (e.g., how an arrangement of tree branches may form the +walls of a maze). To address this challenge, we introduce Deep Schema Grounding +(DSG), a framework that leverages explicit structured representations of visual +abstractions for grounding and reasoning. At the core of DSG are +schemas--dependency graph descriptions of abstract concepts that decompose them +into more primitive-level symbols. DSG uses large language models to extract +schemas, then hierarchically grounds concrete to abstract components of the +schema onto images with vision-language models. The grounded schema is used to +augment visual abstraction understanding. We systematically evaluate DSG and +different methods in reasoning on our new Visual Abstractions Dataset, which +consists of diverse, real-world images of abstract concepts and corresponding +question-answer pairs labeled by humans. We show that DSG significantly +improves the abstract visual reasoning performance of vision-language models, +and is a step toward human-aligned understanding of visual abstractions. + +
+
+
+
+
+ + ☆ Gaussian Garments: Reconstructing Simulation-Ready Clothing with + Photorealistic Appearance from Multi-View Video + + +
+ We introduce Gaussian Garments, a novel approach for reconstructing realistic +simulation-ready garment assets from multi-view videos. Our method represents +garments with a combination of a 3D mesh and a Gaussian texture that encodes +both the color and high-frequency surface details. This representation enables +accurate registration of garment geometries to multi-view videos and helps +disentangle albedo textures from lighting effects. Furthermore, we demonstrate +how a pre-trained graph neural network (GNN) can be fine-tuned to replicate the +real behavior of each garment. The reconstructed Gaussian Garments can be +automatically combined into multi-garment outfits and animated with the +fine-tuned GNN. + +
+
+
+
+
+ + ☆ Enhancing Canine Musculoskeletal Diagnoses: Leveraging Synthetic Image + Data for Pre-Training AI-Models on Visual Documentations + + +
+ The examination of the musculoskeletal system in dogs is a challenging task +in veterinary practice. In this work, a novel method has been developed that +enables efficient documentation of a dog's condition through a visual +representation. However, since the visual documentation is new, there is no +existing training data. The objective of this work is therefore to mitigate the +impact of data scarcity in order to develop an AI-based diagnostic support +system. To this end, the potential of synthetic data that mimics realistic +visual documentations of diseases for pre-training AI models is investigated. +We propose a method for generating synthetic image data that mimics realistic +visual documentations. Initially, a basic dataset containing three distinct +classes is generated, followed by the creation of a more sophisticated dataset +containing 36 different classes. Both datasets are used for the pre-training of +an AI model. Subsequently, an evaluation dataset is created, consisting of 250 +manually created visual documentations for five different diseases. This +dataset, along with a subset containing 25 examples, is used for evaluation. The obtained results on +the evaluation dataset containing 25 examples demonstrate a significant +enhancement of approximately 10% in diagnosis accuracy when utilizing generated +synthetic images that mimic real-world visual documentations. However, these +results do not hold true for the larger evaluation dataset containing 250 +examples, indicating that the advantages of using synthetic data for +pre-training an AI model emerge primarily when dealing with few examples of +visual documentations for a given disease. Overall, this work provides valuable +insights into mitigating the limitations imposed by limited training data +through the strategic use of generated synthetic data, presenting an approach +applicable beyond the canine musculoskeletal assessment domain. +
+
+
+
+
+ + ☆ Low-Cost Tree Crown Dieback Estimation Using Deep Learning-Based + Segmentation + + +
+ The global increase in observed forest dieback, characterised by the death of +tree foliage, heralds widespread decline in forest ecosystems. This degradation +causes significant changes to ecosystem services and functions, including +habitat provision and carbon sequestration, which can be difficult to detect +using traditional monitoring techniques, highlighting the need for large-scale +and high-frequency monitoring. Contemporary developments in the instruments and +methods to gather and process data at large-scales mean this monitoring is now +possible. In particular, the advancement of low-cost drone technology and deep +learning on consumer-level hardware provide new opportunities. Here, we use an +approach based on deep learning and vegetation indices to assess crown dieback +from RGB aerial data without the need for expensive instrumentation such as +LiDAR. We use an iterative approach to match crown footprints predicted by deep +learning with field-based inventory data from a Mediterranean ecosystem +exhibiting drought-induced dieback, and compare expert field-based crown +dieback estimation with vegetation index-based estimates. We obtain high +overall segmentation accuracy (mAP: 0.519) without the need for additional +technical development of the underlying Mask R-CNN model, underscoring the +potential of these approaches for non-expert use and proving their +applicability to real-world conservation. We also find colour-coordinate based +estimates of dieback correlate well with expert field-based estimation. +Substituting ground truth for Mask R-CNN model predictions showed negligible +impact on dieback estimates, indicating robustness. Our findings demonstrate +the potential of automated data collection and processing, including the +application of deep learning, to improve the coverage, speed and cost of forest +dieback monitoring. + +
+
+ comment: 16 pages, 5 figures +
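+
+ To illustrate the vegetation-index side of the pipeline above, the sketch below scores per-crown dieback from an RGB index; the Excess Green index and the threshold are illustrative stand-ins for whatever index and calibration the authors actually used.
+
+ ```python
+ import numpy as np
+
+ def excess_green(rgb):
+     """Normalized Excess Green index from an (H, W, 3) RGB image in [0, 1]."""
+     r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
+     s = r + g + b + 1e-8
+     return 2 * (g / s) - (r / s) - (b / s)
+
+ def crown_dieback(rgb, crown_mask, green_threshold=0.05):
+     """Fraction of crown pixels whose index falls below a 'healthy foliage' cutoff."""
+     exg = excess_green(rgb.astype(np.float32))
+     crown = exg[crown_mask]
+     return float((crown < green_threshold).mean())
+
+ img = np.random.rand(512, 512, 3)
+ mask = np.zeros((512, 512), dtype=bool)
+ mask[100:200, 100:200] = True                       # e.g. one predicted crown footprint
+ print(round(crown_dieback(img, mask), 3))
+ ```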
+
+
+
+
+ + ☆ AD-Lite Net: A Lightweight and Concatenated CNN Model for Alzheimer's + Detection from MRI Images + + +
+ Alzheimer's Disease (AD) is a non-curable progressive neurodegenerative +disorder that affects the human brain, leading to a decline in memory, +cognitive abilities, and eventually, the ability to carry out daily tasks. +Manual diagnosis of Alzheimer's disease from MRI images suffers from low +sensitivity and is a very tedious process for neurologists. Therefore, there +is a need for an automatic Computer Assisted Diagnosis (CAD) system, which can +detect AD at early stages with higher accuracy. In this research, we have +proposed a novel AD-Lite Net model (trained from scratch), that could alleviate +the aforementioned problem. The novelties we bring in this research are: +(I) We have proposed a very lightweight CNN model by incorporating Depth Wise +Separable Convolutional (DWSC) layers and Global Average Pooling (GAP) layers. +(II) We have leveraged a ``parallel concatenation block'' (pcb), in the +proposed AD-Lite Net model. This pcb consists of a Transformation layer +(Tx-layer), followed by two convolutional layers, which are thereby +concatenated with the original base model. This Tx-layer converts the features +into a very distinct kind of features, which are imperative for detecting Alzheimer's +disease. As a consequence, the proposed AD-Lite Net model with ``parallel +concatenation'' converges faster and automatically mitigates the class +imbalance problem from the MRI datasets in a very generalized way. For the +validity of our proposed model, we have implemented it on three different MRI +datasets. Furthermore, we have combined the ADNI and AD datasets and +subsequently performed a 10-fold cross-validation experiment to verify the +model's generalization ability. Extensive experimental results showed that our +proposed model has outperformed all the existing CNN models, and one recent +Vision Transformer (ViT) model by a significant margin. +
+
+ comment: NA +
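+
+ A toy network in the spirit of the description above, combining depthwise-separable convolutions, global average pooling, and a parallel branch concatenated with the base features; channel sizes and the choice of transformation layer are assumptions, not the published architecture.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ def dws_block(c_in, c_out):
+     """Depthwise-separable convolution block with downsampling."""
+     return nn.Sequential(
+         nn.Conv2d(c_in, c_in, 3, padding=1, groups=c_in),   # depthwise
+         nn.Conv2d(c_in, c_out, 1),                          # pointwise
+         nn.BatchNorm2d(c_out), nn.ReLU(), nn.MaxPool2d(2))
+
+ class LiteNet(nn.Module):
+     def __init__(self, n_classes=3):
+         super().__init__()
+         self.base = nn.Sequential(dws_block(1, 16), dws_block(16, 32), dws_block(32, 64))
+         # parallel branch: a stand-in transformation layer followed by convolutions
+         self.parallel = nn.Sequential(
+             nn.Conv2d(1, 8, 5, padding=2), nn.ReLU(),
+             nn.Conv2d(8, 16, 3, padding=1), nn.ReLU(),
+             nn.Conv2d(16, 64, 3, stride=8, padding=1))
+         self.head = nn.Linear(128, n_classes)
+
+     def forward(self, x):
+         feats = torch.cat([self.base(x), self.parallel(x)], dim=1)  # parallel concatenation
+         pooled = feats.mean(dim=(2, 3))                             # global average pooling
+         return self.head(pooled)
+
+ print(LiteNet()(torch.randn(2, 1, 128, 128)).shape)
+ ```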
+
+
+
+
+ + ☆ Learning to Match 2D Keypoints Across Preoperative MR and Intraoperative + Ultrasound MICCAI 2024 + + +
+ We propose in this paper a texture-invariant 2D keypoints descriptor +specifically designed for matching preoperative Magnetic Resonance (MR) images +with intraoperative Ultrasound (US) images. We introduce a +matching-by-synthesis strategy, where intraoperative US images are synthesized +from MR images accounting for multiple MR modalities and intraoperative US +variability. We build our training set by enforcing keypoints localization over +all images then train a patient-specific descriptor network that learns +texture-invariant discriminant features in a supervised contrastive manner, +leading to robust keypoints descriptors. Our experiments on real cases with +ground truth show the effectiveness of the proposed approach, outperforming the +state-of-the-art methods and achieving 80.35% matching precision on average. + +
+
+ comment: Accepted for publication at the International Workshop of Advances in + Simplifying Medical UltraSound (ASMUS) at MICCAI 2024 +
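+
+ The supervised contrastive training signal described above can be sketched as a symmetric InfoNCE-style loss between MR and synthesized-US descriptors of the same keypoints; the descriptor dimension and temperature are assumed values, not the paper's settings.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def contrastive_matching_loss(mr_desc, us_desc, temperature=0.07):
+     """mr_desc, us_desc: (N, D) descriptors of the same N keypoints in both modalities."""
+     mr = F.normalize(mr_desc, dim=1)
+     us = F.normalize(us_desc, dim=1)
+     logits = mr @ us.t() / temperature          # (N, N) cross-modal similarity matrix
+     targets = torch.arange(mr.size(0))          # the i-th MR keypoint matches the i-th US one
+     return 0.5 * (F.cross_entropy(logits, targets) +
+                   F.cross_entropy(logits.t(), targets))
+
+ loss = contrastive_matching_loss(torch.randn(32, 128), torch.randn(32, 128))
+ print(float(loss))
+ ```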
+
+
+
+
+ + ☆ High-Frequency Anti-DreamBooth: Robust Defense Against Image Synthesis ECCV 2024 + + +
+ Recently, text-to-image generative models have been misused to create +unauthorized malicious images of individuals, posing a growing social problem. +Previous solutions, such as Anti-DreamBooth, add adversarial noise to images to +protect them from being used as training data for malicious generation. +However, we found that the adversarial noise can be removed by adversarial +purification methods such as DiffPure. Therefore, we propose a new adversarial +attack method that adds strong perturbation on the high-frequency areas of +images to make it more robust to adversarial purification. Our experiment +showed that the adversarial images retained noise even after adversarial +purification, hindering malicious image generation. + +
+
+ comment: ECCV 2024 Workshop The Dark Side of Generative AIs and Beyond +
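+
+ One simple way to realize the core idea above, restricting a perturbation to high-frequency image content, is an FFT high-pass projection, sketched below; the cutoff and the projection itself are illustrative and do not reproduce the paper's attack.
+
+ ```python
+ import numpy as np
+
+ def high_pass_mask(h, w, cutoff=0.25):
+     """Binary mask selecting spatial frequencies above the cutoff (in cycles/pixel)."""
+     fy = np.fft.fftfreq(h)[:, None]
+     fx = np.fft.fftfreq(w)[None, :]
+     return (np.sqrt(fx ** 2 + fy ** 2) > cutoff).astype(float)
+
+ def project_to_high_freq(perturbation):
+     """Keep only the high-frequency content of an (H, W) perturbation."""
+     spec = np.fft.fft2(perturbation)
+     spec *= high_pass_mask(*perturbation.shape)
+     return np.real(np.fft.ifft2(spec))
+
+ delta = np.random.randn(256, 256) * 0.01
+ delta_hf = project_to_high_freq(delta)
+ print(delta_hf.shape, float(abs(delta_hf).max()))
+ ```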
+
+
+
+
+ + ☆ Open Source Infrastructure for Automatic Cell Segmentation + + +
+ Automated cell segmentation is crucial for various biological and medical +applications, facilitating tasks like cell counting, morphology analysis, and +drug discovery. However, manual segmentation is time-consuming and prone to +subjectivity, necessitating robust automated methods. This paper presents +open-source infrastructure, utilizing the UNet model, a deep-learning +architecture noted for its effectiveness in image segmentation tasks. This +implementation is integrated into the open-source DeepChem package, enhancing +accessibility and usability for researchers and practitioners. The resulting +tool offers a convenient and user-friendly interface, reducing the barrier to +entry for cell segmentation while maintaining high accuracy. Additionally, we +benchmark this model against various datasets, demonstrating its robustness and +versatility across different imaging conditions and cell types. + +
+
+
+
+
+ + ☆ Cross-Attention Based Influence Model for Manual and Nonmanual Sign + Language Analysis + + +
+ Both manual (relating to the use of hands) and non-manual markers (NMM), such +as facial expressions or mouthing cues, are important for providing the +complete meaning of phrases in American Sign Language (ASL). Efforts have been +made in advancing sign language to spoken/written language understanding, but +most of these have primarily focused on manual features only. In this work, +using advanced neural machine translation methods, we examine and report on the +extent to which facial expressions contribute to understanding sign language +phrases. We present a sign language translation architecture consisting of +two-stream encoders, with one encoder handling the face and the other handling +the upper body (with hands). We propose a new parallel cross-attention decoding +mechanism that is useful for quantifying the influence of each input modality +on the output. The two streams from the encoder are directed simultaneously to +different attention stacks in the decoder. Examining the properties of the +parallel cross-attention weights allows us to analyze the importance of facial +markers compared to body and hand features during a translating task. + +
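+
+ A minimal decoder block in the spirit of the parallel cross-attention design above: two cross-attention streams (face and body) whose attention weights can be compared to gauge each modality's influence. Layer sizes are illustrative, and residual connections and normalization are omitted for brevity.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ParallelCrossAttentionBlock(nn.Module):
+     def __init__(self, d_model=256, n_heads=4):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
+         self.face_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
+         self.body_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
+         self.ffn = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.ReLU(),
+                                  nn.Linear(4 * d_model, d_model))
+
+     def forward(self, tgt, face_mem, body_mem):
+         x, _ = self.self_attn(tgt, tgt, tgt)
+         f, face_w = self.face_attn(x, face_mem, face_mem)   # attends to the face stream
+         b, body_w = self.body_attn(x, body_mem, body_mem)   # attends to the body/hand stream
+         out = self.ffn(x + f + b)
+         return out, face_w, body_w                          # weights indicate modality influence
+
+ blk = ParallelCrossAttentionBlock()
+ out, fw, bw = blk(torch.randn(2, 10, 256), torch.randn(2, 50, 256), torch.randn(2, 80, 256))
+ print(out.shape, fw.shape, bw.shape)
+ ```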
+
+
+
+
+ + ☆ SDformer: Efficient End-to-End Transformer for Depth Completion + + +
+ Depth completion aims to predict dense depth maps with sparse depth +measurements from a depth sensor. Currently, Convolutional Neural Network (CNN) +based models are the most popular methods applied to depth completion tasks. +However, despite the excellent high-end performance, they suffer from a limited +representation area. To overcome the drawbacks of CNNs, a more effective and +powerful method has been presented: the Transformer, an adaptive +self-attention-based sequence-to-sequence model. However, the computational cost of the standard +Transformer grows quadratically with input resolution due to the key-query +dot-product, which makes it poorly suited to depth completion +tasks. In this work, we propose a different window-based Transformer +architecture for depth completion tasks named Sparse-to-Dense Transformer +(SDformer). The network consists of an input module for the depth map and RGB +image features extraction and concatenation, a U-shaped encoder-decoder +Transformer for extracting deep features, and a refinement module. +Specifically, we first concatenate the depth map features with the RGB image +features through the input module. Then, instead of calculating self-attention +with the whole feature maps, we apply different window sizes to extract the +long-range depth dependencies. Finally, we refine the predicted features from +the input module and the U-shaped encoder-decoder Transformer module to get +enriched depth features and employ a convolution layer to obtain the dense +depth map. In practice, the SDformer obtains state-of-the-art results against +the CNN-based depth completion models with lower computing loads and parameters +on the NYU Depth V2 and KITTI DC datasets. +
+
+ comment: Presented at the International Conference on Industrial Automation, + Robotics and Control Engineering (IARCE) 2022 +
+
+
+
+
+ + ☆ MagicStyle: Portrait Stylization Based on Reference Image + + +
+ The development of diffusion models has significantly advanced the research +on image stylization, particularly in the area of stylizing a content image +based on a given style image, which has attracted many scholars. The main +challenge in this reference image stylization task lies in how to maintain the +details of the content image while incorporating the color and texture features +of the style image. This challenge becomes even more pronounced when the +content image is a portrait which has complex textural details. To address this +challenge, we propose a diffusion model-based reference image stylization +method specifically for portraits, called MagicStyle. MagicStyle consists of +two phases: Content and Style DDIM Inversion (CSDI) and Feature Fusion Forward +(FFF). The CSDI phase involves a reverse denoising process, where DDIM +Inversion is performed separately on the content image and the style image, +storing the self-attention query, key and value features of both images during +the inversion process. The FFF phase executes forward denoising, harmoniously +integrating the texture and color information from the pre-stored feature +queries, keys and values into the diffusion generation process based on our +Well-designed Feature Fusion Attention (FFA). We conducted comprehensive +comparative and ablation experiments to validate the effectiveness of our +proposed MagicStyle and FFA. + +
+
+
+
+
+ + ☆ Effective Segmentation of Post-Treatment Gliomas Using Simple + Approaches: Artificial Sequence Generation and Ensemble Models MICCAI + + +
+ Segmentation is a crucial task in the medical imaging field and is often an +important primary step or even a prerequisite to the analysis of medical +volumes. Yet treatments such as surgery complicate the accurate delineation of +regions of interest. The BraTS Post-Treatment 2024 Challenge published the +first public dataset for post-surgery glioma segmentation and addresses the +aforementioned issue by fostering the development of automated segmentation +tools for glioma in MRI data. In this effort, we propose two straightforward +approaches to enhance the segmentation performance of deep learning-based +methodologies. First, we incorporate an additional input based on a simple +linear combination of the available MRI sequences, which highlights +enhancing tumors. Second, we employ various ensembling methods to weigh the +contribution of a battery of models. Our results demonstrate that these +approaches significantly improve segmentation performance compared to baseline +models, underscoring the effectiveness of these simple approaches in improving +medical image segmentation tasks. +
+
+ comment: Invited for an Oral Presentation at the MICCAI BraTS Challenge 2024 +
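+
+ As a hedged illustration of the first approach above, the sketch builds an extra input channel from a simple linear combination of sequences; the specific combination shown (post- minus pre-contrast T1) is an assumption chosen because it tends to emphasize enhancement, not necessarily the one used in the paper.
+
+ ```python
+ import numpy as np
+
+ def add_enhancement_channel(t1, t1ce, t2, flair):
+     """Stack the MRI sequences and append a channel emphasizing contrast enhancement."""
+     extra = np.clip(t1ce - t1, 0, None)          # enhancing regions tend to light up here
+     extra = extra / (extra.max() + 1e-8)         # rescale the derived channel
+     return np.stack([t1, t1ce, t2, flair, extra], axis=0)
+
+ vol = lambda: np.random.rand(128, 128, 128).astype(np.float32)
+ x = add_enhancement_channel(vol(), vol(), vol(), vol())
+ print(x.shape)  # (5, 128, 128, 128): four sequences plus the derived channel
+ ```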
+
+
+
+
+ + ☆ The JPEG Pleno Learning-based Point Cloud Coding Standard: Serving Man + and Machine + + +
+ Efficient point cloud coding has become increasingly critical for multiple +applications such as virtual reality, autonomous driving, and digital twin +systems, where rich and interactive 3D data representations may functionally +make the difference. Deep learning has emerged as a powerful tool in this +domain, offering advanced techniques for compressing point clouds more +efficiently than conventional coding methods while also allowing effective +computer vision tasks to be performed in the compressed domain, thus, for the first +time, making available a common compressed visual representation effective for +both man and machine. Taking advantage of this potential, JPEG has recently +finalized the JPEG Pleno Learning-based Point Cloud Coding (PCC) standard +offering efficient lossy coding of static point clouds, targeting both human +visualization and machine processing by leveraging deep learning models for +geometry and color coding. The geometry is processed directly in its original +3D form using sparse convolutional neural networks, while the color data is +projected onto 2D images and encoded using the likewise learning-based JPEG AI +standard. The goal of this paper is to provide a complete technical description +of the JPEG PCC standard, along with a thorough benchmarking of its performance +against the state-of-the-art, while highlighting its main strengths and +weaknesses. In terms of compression performance, JPEG PCC outperforms the +conventional MPEG PCC standards, especially in geometry coding, achieving +significant rate reductions. Color compression performance is less competitive, +but this is overcome by the power of a full learning-based coding framework for +both geometry and color and the associated effective compressed domain +processing. +
+
+ comment: 28 pages, 12 figures, submitted to IEEE Access +
+
+
+
+
+ + ♻ ☆ Disentangling spatio-temporal knowledge for weakly supervised object + detection and segmentation in surgical video + + +
+ Weakly supervised video object segmentation (WSVOS) enables the +identification of segmentation maps without requiring an extensive training +dataset of object masks, relying instead on coarse video labels indicating +object presence. Current state-of-the-art methods either require multiple +independent stages of processing that employ motion cues or, in the case of +end-to-end trainable networks, lack in segmentation accuracy, in part due to +the difficulty of learning segmentation maps from videos with transient object +presence. This limits the application of WSVOS for semantic annotation of +surgical videos where multiple surgical tools frequently move in and out of the +field of view, a problem that is more difficult than typically encountered in +WSVOS. This paper introduces Video Spatio-Temporal Disentanglement Networks +(VDST-Net), a framework to disentangle spatiotemporal information using +semi-decoupled knowledge distillation to predict high-quality class activation +maps (CAMs). A teacher network designed to resolve temporal conflicts when +specifics about object location and timing in the video are not provided works +with a student network that integrates information over time by leveraging +temporal dependencies. We demonstrate the efficacy of our framework on a public +reference dataset and on a more challenging surgical video dataset where +objects are, on average, present in less than 60\% of annotated frames. Our +method outperforms state-of-the-art techniques and generates superior +segmentation masks under video-level weak supervision. + +
+
+ comment: 13 pages, 6 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Solving the Inverse Problem of Electrocardiography for Cardiac Digital + Twins: A Survey + + +
+ Cardiac digital twins (CDTs) are personalized virtual representations used to +understand complex cardiac mechanisms. A critical component of CDT development +is solving the ECG inverse problem, which enables the reconstruction of cardiac +sources and the estimation of patient-specific electrophysiology (EP) +parameters from surface ECG data. Despite challenges from complex cardiac +anatomy, noisy ECG data, and the ill-posed nature of the inverse problem, +recent advances in computational methods have greatly improved the accuracy and +efficiency of ECG inverse inference, strengthening the fidelity of CDTs. This +paper aims to provide a comprehensive review of the methods of solving ECG +inverse problem, the validation strategies, the clinical applications, and +future perspectives. For the methodologies, we broadly classify +state-of-the-art approaches into two categories: deterministic and +probabilistic methods, including both conventional and deep learning-based +techniques. Integrating physics laws with deep learning models holds promise, +but challenges such as capturing dynamic electrophysiology accurately, +accessing accurate domain knowledge, and quantifying prediction uncertainty +persist. Integrating models into clinical workflows while ensuring +interpretability and usability for healthcare professionals is essential. +Overcoming these challenges will drive further research in CDTs. + +
+
+
+
+
+ + ♻ ☆ Variational Bayes image restoration with compressive autoencoders + + +
+ Regularization of inverse problems is of paramount importance in
+computational imaging. The ability of neural networks to learn efficient image
+representations has been recently exploited to design powerful data-driven
+regularizers. While state-of-the-art plug-and-play methods rely on an implicit
+regularization provided by neural denoisers, alternative Bayesian approaches
+consider Maximum A Posteriori (MAP) estimation in the latent space of a
+generative model, thus with an explicit regularization. However,
+state-of-the-art deep generative models require a huge amount of training data
+compared to denoisers. Besides, their complexity hampers the optimization
+involved in latent MAP derivation. In this work, we first propose to use
+compressive autoencoders instead. These networks, which can be seen as
+variational autoencoders with a flexible latent prior, are smaller and easier
+to train than state-of-the-art generative models. As a second contribution, we
+introduce the Variational Bayes Latent Estimation (VBLE) algorithm, which
+performs latent estimation within the framework of variational inference.
+Thanks to a simple yet efficient parameterization of the variational posterior,
+VBLE allows for fast and easy (approximate) posterior sampling. Experimental
+results on image datasets BSD and FFHQ demonstrate that VBLE reaches
+performance similar to that of state-of-the-art plug-and-play methods, while
+being able to quantify uncertainties significantly faster than other existing
+posterior sampling techniques.
+
+
+
+
+
+ + ♻ ☆ Geospecific View Generation -- Geometry-Context Aware High-resolution + Ground View Inference from Satellite Views + + +
+ Predicting realistic ground views from satellite imagery in urban scenes is a
+challenging task due to the significant view gaps between satellite and
+ground-view images. We propose a novel pipeline to tackle this challenge, by
+generating geospecific views that maximally respect the weak geometry and
+texture from multi-view satellite images. Different from existing approaches
+that hallucinate images from cues such as partial semantics or geometry from
+overhead satellite images, our method directly predicts ground-view images at
+geolocation by using a comprehensive set of information from the satellite
+image, resulting in ground-level images with a resolution boost of a factor of
+ten or more. We leverage a novel building refinement method to reduce geometric
+distortions in satellite data at ground level, which ensures the creation of
+accurate conditions for view synthesis using diffusion networks. Moreover, we
+propose a novel geospecific prior, which prompts distribution learning of
+diffusion models to respect image samples that are closer to the geolocation of
+the predicted images. We demonstrate our pipeline is the first to generate
+close-to-real and geospecific ground views merely based on satellite images.
+
+
+ comment: 11 figures +
+
+
+
+
+ + ♻ ☆ Prompt2Fashion: An automatically generated fashion dataset + + +
+ Despite the rapid evolution and increasing efficacy of language and vision +generative models, there remains a lack of comprehensive datasets that bridge +the gap between personalized fashion needs and AI-driven design, limiting the +potential for truly inclusive and customized fashion solutions. In this work, +we leverage generative models to automatically construct a fashion image +dataset tailored to various occasions, styles, and body types as instructed by +users. We use different Large Language Models (LLMs) and prompting strategies +to offer personalized outfits of high aesthetic quality, detail, and relevance +to both expert and non-expert users' requirements, as demonstrated by +qualitative analysis. Up until now the evaluation of the generated outfits has +been conducted by non-expert human subjects. Despite the provided fine-grained +insights on the quality and relevance of generation, we extend the discussion +on the importance of expert knowledge for the evaluation of artistic +AI-generated datasets such as this one. Our dataset is publicly available on +GitHub at https://github.com/georgiarg/Prompt2Fashion. + +
+
+
+
+
+ + ♻ ☆ Face Reconstruction Transfer Attack as Out-of-Distribution + Generalization ECCV2024 + + +
+ Understanding the vulnerability of face recognition systems to malicious
+attacks is of critical importance. Previous works have focused on
+reconstructing face images that can penetrate a targeted verification system.
+Even in the white-box scenario, however, naively reconstructed images
+misrepresent the identity information, hence the attacks are easily neutralized
+once the face system is updated or changed. In this paper, we aim to
+reconstruct face images which are capable of transferring face attacks on
+unseen encoders. We term this problem Face Reconstruction Transfer Attack
+(FRTA) and show that it can be formulated as an out-of-distribution (OOD)
+generalization problem. Inspired by its OOD nature, we propose to solve FRTA by
+Averaged Latent Search and Unsupervised Validation with pseudo target (ALSUV).
+To strengthen the reconstruction attack on OOD unseen encoders, ALSUV
+reconstructs the face by searching the latent of the amortized generator
+StyleGAN2 through multiple latent optimizations, latent optimization trajectory
+averaging, and unsupervised validation with a pseudo target. We demonstrate the
+efficacy and generalization of our method on widely used face datasets,
+accompanying it with extensive ablation studies and visual, qualitative, and
+quantitative analyses. The source code will be released.
+
+
+ comment: Accepted to ECCV2024 +
+
+
+
+
+ + ♻ ☆ What Makes a Face Look like a Hat: Decoupling Low-level and High-level + Visual Properties with Image Triplets ECCV 2024 + + +
+ In visual decision making, high-level features, such as object categories, +have a strong influence on choice. However, the impact of low-level features on +behavior is less understood partly due to the high correlation between high- +and low-level features in the stimuli presented (e.g., objects of the same +category are more likely to share low-level features). To disentangle these +effects, we propose a method that de-correlates low- and high-level visual +properties in a novel set of stimuli. Our method uses two Convolutional Neural +Networks (CNNs) as candidate models of the ventral visual stream: the CORnet-S +that has high neural predictivity in high-level, IT-like responses and the +VGG-16 that has high neural predictivity in low-level responses. Triplets +(root, image1, image2) of stimuli are parametrized by the level of low- and +high-level similarity of images extracted from the different layers. These +stimuli are then used in a decision-making task where participants are tasked +to choose the most similar-to-the-root image. We found that different networks +show differing abilities to predict the effects of low-versus-high-level +similarity: while CORnet-S outperforms VGG-16 in explaining human choices based +on high-level similarity, VGG-16 outperforms CORnet-S in explaining human +choices based on low-level similarity. Using Brain-Score, we observed that the +behavioral prediction abilities of different layers of these networks +qualitatively corresponded to their ability to explain neural activity at +different levels of the visual hierarchy. In summary, our algorithm for +stimulus set generation enables the study of how different representations in +the visual stream affect high-level cognitive behaviors. + +
+
+ comment: Accepted at Workshop on Human-inspired Computer Vision @ ECCV 2024 +
+
+
+
+
+ + ♻ ☆ DINOv2 Rocks Geological Image Analysis: Classification, Segmentation, + and Interpretability + + +
+ Recent advancements in computer vision have significantly improved image +analysis tasks. Yet, deep learning models often struggle when applied to +domains outside their training distribution, such as in geosciences, where +domain-specific data can be scarce. This study investigates the classification, +segmentation, and interpretability of CT-scan images of rock samples, focusing +on the application of modern computer vision techniques to geoscientific tasks. +We compare a range of segmentation methods to assess their efficacy, +efficiency, and adaptability in geological image analysis. The methods +evaluated include Otsu thresholding, clustering techniques (K-means, fuzzy +C-means), a supervised machine learning approach (Random Forest), and deep +learning models (UNet, ResNet152, and DINOv2), using ten binary sandstone +datasets and three multi-class calcite datasets. DINOv2 was selected for its +promising results in feature extraction and its potential applicability in +geoscientific tasks, prompting further assessment of its interpretability and +effectiveness in processing CT-scanned rock data. For classification, a +non-fine-tuned DINOv2 demonstrates strong performance in classifying rock +images, even when the CT-scans are outside its original training set. In +segmentation tasks, thresholding and clustering techniques, though +computationally efficient, produce subpar results despite preprocessing +efforts. In contrast, supervised methods achieve better performance. While deep +learning methods demand greater computational resources, they require minimal +intervention and offer superior generalization. A LoRA fine-tuned DINOv2, in +particular, excels in out-of-distribution segmentation and outperforms other +methods in multi-class tasks, even with limited data. Notably, the segmentation +masks generated by DINOv2 often appear more accurate than the original targets, +based on visual inspection. + +
+
+ comment: Minor typos fixing, added figures and revisions of some sections +
+
+
+
+
+ + ♻ ☆ Explicit Mutual Information Maximization for Self-Supervised Learning + + +
+ Recently, self-supervised learning (SSL) has been extensively studied. +Theoretically, mutual information maximization (MIM) is an optimal criterion +for SSL, with a strong theoretical foundation in information theory. However, +it is difficult to directly apply MIM in SSL since the data distribution is not +analytically available in applications. In practice, many existing methods can +be viewed as approximate implementations of the MIM criterion. This work shows +that, based on the invariance property of MI, explicit MI maximization can be +applied to SSL under a generic distribution assumption, i.e., a relaxed +condition of the data distribution. We further illustrate this by analyzing the +generalized Gaussian distribution. Based on this result, we derive a loss +function based on the MIM criterion using only second-order statistics. We +implement the new loss for SSL and demonstrate its effectiveness via extensive +experiments. + +
+
+
+
+
+ + ♻ ☆ The NPU-ASLP System Description for Visual Speech Recognition in CNVSRC + 2024 SC 2024 + + +
+ This paper delineates the visual speech recognition (VSR) system introduced +by the NPU-ASLP (Team 237) in the second Chinese Continuous Visual Speech +Recognition Challenge (CNVSRC 2024), engaging in all four tracks, including the +fixed and open tracks of Single-Speaker VSR Task and Multi-Speaker VSR Task. In +terms of data processing, we leverage the lip motion extractor from the +baseline1 to produce multiscale video data. Besides, various augmentation +techniques are applied during training, encompassing speed perturbation, random +rotation, horizontal flipping, and color transformation. The VSR model adopts +an end-to-end architecture with joint CTC/attention loss, introducing Enhanced +ResNet3D visual frontend, E-Branchformer encoder, and Bi-directional +Transformer decoder. Our approach yields a 30.47% CER for the Single-Speaker +Task and 34.30% CER for the Multi-Speaker Task, securing second place in the +open track of the Single-Speaker Task and first place in the other three +tracks. + +
+
+ comment: Included in CNVSRC Workshop 2024, NCMMSC 2024 +
+
+
+
+
+ + ♻ ☆ Zero-Shot Whole Slide Image Retrieval in Histopathology Using Embeddings + of Foundation Models + + +
+ We have tested recently published foundation models for histopathology for +image retrieval. We report macro average of F1 score for top-1 retrieval, +majority of top-3 retrievals, and majority of top-5 retrievals. We perform +zero-shot retrievals, i.e., we do not alter embeddings and we do not train any +classifier. As test data, we used diagnostic slides of TCGA, The Cancer Genome +Atlas, consisting of 23 organs and 117 cancer subtypes. As a search platform we +used Yottixel that enabled us to perform WSI search using patches. Achieved F1 +scores show low performance, e.g., for top-5 retrievals, 27% +/- 13% +(Yottixel-DenseNet), 42% +/- 14% (Yottixel-UNI), 40%+/-13% (Yottixel-Virchow), +41%+/-13% (Yottixel-GigaPath), and 41%+/-14% (GigaPath WSI). + +
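The evaluation protocol described here (macro-averaged F1 over top-1 retrieval and the majority vote of top-3/top-5 retrievals, with frozen embeddings and no classifier training) can be illustrated with a small Python sketch. The embeddings, labels, and cosine-similarity search below are synthetic stand-ins, not the Yottixel patch-based WSI search or the actual foundation-model features used in the paper.

import numpy as np
from collections import Counter
from sklearn.metrics import f1_score

def majority_topk_predictions(query_emb, index_emb, index_labels, k):
    """For each query, retrieve the k nearest index embeddings (cosine
    similarity) and predict the majority label among them."""
    q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
    d = index_emb / np.linalg.norm(index_emb, axis=1, keepdims=True)
    sims = q @ d.T                                # (n_queries, n_index)
    topk = np.argsort(-sims, axis=1)[:, :k]       # indices of k best matches
    preds = []
    for row in topk:
        votes = Counter(index_labels[i] for i in row)
        preds.append(votes.most_common(1)[0][0])
    return np.array(preds)

# Toy stand-ins for frozen foundation-model embeddings of slides/patches.
rng = np.random.default_rng(0)
index_emb, index_labels = rng.normal(size=(200, 64)), rng.integers(0, 5, 200)
query_emb, query_labels = rng.normal(size=(50, 64)), rng.integers(0, 5, 50)

for k in (1, 3, 5):
    preds = majority_topk_predictions(query_emb, index_emb, index_labels, k)
    print(f"top-{k} majority macro-F1:", f1_score(query_labels, preds, average="macro"))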
+
+ comment: This paper will be updated with more results +
+
+
+
+
+ + ♻ ☆ DSDFormer: An Innovative Transformer-Mamba Framework for Robust + High-Precision Driver Distraction Identification + + +
+ Driver distraction remains a leading cause of traffic accidents, posing a +critical threat to road safety globally. As intelligent transportation systems +evolve, accurate and real-time identification of driver distraction has become +essential. However, existing methods struggle to capture both global contextual +and fine-grained local features while contending with noisy labels in training +datasets. To address these challenges, we propose DSDFormer, a novel framework +that integrates the strengths of Transformer and Mamba architectures through a +Dual State Domain Attention (DSDA) mechanism, enabling a balance between +long-range dependencies and detailed feature extraction for robust driver +behavior recognition. Additionally, we introduce Temporal Reasoning Confident +Learning (TRCL), an unsupervised approach that refines noisy labels by +leveraging spatiotemporal correlations in video sequences. Our model achieves +state-of-the-art performance on the AUC-V1, AUC-V2, and 100-Driver datasets and +demonstrates real-time processing efficiency on the NVIDIA Jetson AGX Orin +platform. Extensive experimental results confirm that DSDFormer and TRCL +significantly improve both the accuracy and robustness of driver distraction +detection, offering a scalable solution to enhance road safety. + +
+
+
+
+
+ + ♻ ☆ Unified Domain Adaptive Semantic Segmentation + + +
+ Unsupervised Domain Adaptive Semantic Segmentation (UDA-SS) aims to transfer +the supervision from a labeled source domain to an unlabeled target domain. The +majority of existing UDA-SS works typically consider images whilst recent +attempts have extended further to tackle videos by modeling the temporal +dimension. Although the two lines of research share the major challenges -- +overcoming the underlying domain distribution shift, their studies are largely +independent, resulting in fragmented insights, a lack of holistic +understanding, and missed opportunities for cross-pollination of ideas. This +fragmentation prevents the unification of methods, leading to redundant efforts +and suboptimal knowledge transfer across image and video domains. Under this +observation, we advocate unifying the study of UDA-SS across video and image +scenarios, enabling a more comprehensive understanding, synergistic +advancements, and efficient knowledge sharing. To that end, we explore the +unified UDA-SS from a general data augmentation perspective, serving as a +unifying conceptual framework, enabling improved generalization, and potential +for cross-pollination of ideas, ultimately contributing to the overall progress +and practical impact of this field of research. Specifically, we propose a +Quad-directional Mixup (QuadMix) method, characterized by tackling distinct +point attributes and feature inconsistencies through four-directional paths for +intra- and inter-domain mixing in a feature space. To deal with temporal shifts +with videos, we incorporate optical flow-guided feature aggregation across +spatial and temporal dimensions for fine-grained domain alignment. Extensive +experiments show that our method outperforms the state-of-the-art works by +large margins on four challenging UDA-SS benchmarks. Our source code and models +will be released at \url{https://github.com/ZHE-SAPI/UDASS}. + +
+
+ comment: 17 pages,11 figures, 11 tables, submitted to IEEE Transactions on + Pattern Analysis and Machine Intelligence +
+
+
+
+
+ + ♻ ☆ Contrastive Learning and the Emergence of Attributes Associations + + +
+ In response to an object presentation, supervised learning schemes generally +respond with a parsimonious label. Upon a similar presentation we humans +respond again with a label, but are flooded, in addition, by a myriad of +associations. A significant portion of these consist of the presented object +attributes. Contrastive learning is a semi-supervised learning scheme based on +the application of identity preserving transformations on the object input +representations. It is conjectured in this work that these same applied +transformations preserve, in addition to the identity of the presented object, +also the identity of its semantically meaningful attributes. The corollary of +this is that the output representations of such a contrastive learning scheme +contain valuable information not only for the classification of the presented +object, but also for the presence or absence decision of any attribute of +interest. Simulation results which demonstrate this idea and the feasibility of +this conjecture are presented. + +
+
+ comment: 10 pages +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ On the challenges of studying bias in Recommender Systems: A UserKNN + case study RecSys 2024 + + +
+ Statements on the propagation of bias by recommender systems are often hard +to verify or falsify. Research on bias tends to draw from a small pool of +publicly available datasets and is therefore bound by their specific +properties. Additionally, implementation choices are often not explicitly +described or motivated in research, while they may have an effect on bias +propagation. In this paper, we explore the challenges of measuring and +reporting popularity bias. We showcase the impact of data properties and +algorithm configurations on popularity bias by combining synthetic data with +well known recommender systems frameworks that implement UserKNN. First, we +identify data characteristics that might impact popularity bias, based on the +functionality of UserKNN. Accordingly, we generate various datasets that +combine these characteristics. Second, we locate UserKNN configurations that +vary across implementations in literature. We evaluate popularity bias for five +synthetic datasets and five UserKNN configurations, and offer insights on their +joint effect. We find that, depending on the data characteristics, various +UserKNN configurations can lead to different conclusions regarding the +propagation of popularity bias. These results motivate the need for explicitly +addressing algorithmic configuration and data properties when reporting and +interpreting bias in recommender systems. + +
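As a rough illustration of the kind of experiment described, the sketch below builds a synthetic long-tailed interaction matrix, scores items with a plain cosine-similarity UserKNN, and compares the average popularity of recommended items with the catalogue average. The neighbourhood size, data generator, and bias measure are illustrative choices, not the configurations or metrics studied in the paper.

import numpy as np

rng = np.random.default_rng(42)
n_users, n_items, k, top_n = 200, 100, 20, 10

# Synthetic implicit-feedback matrix with a long-tailed item popularity.
item_pop = rng.zipf(1.5, n_items).astype(float)
item_pop /= item_pop.sum()
R = (rng.random((n_users, n_items)) < item_pop * 20).astype(float)

# UserKNN: cosine user-user similarity, score items via the k nearest neighbours.
norms = np.linalg.norm(R, axis=1, keepdims=True) + 1e-9
S = (R / norms) @ (R / norms).T
np.fill_diagonal(S, 0.0)

catalog_pop = R.sum(axis=0) / n_users
avg_rec_pop = []
for u in range(n_users):
    nbrs = np.argsort(-S[u])[:k]
    scores = S[u, nbrs] @ R[nbrs]          # neighbourhood-weighted item scores
    scores[R[u] > 0] = -np.inf             # do not re-recommend seen items
    recs = np.argsort(-scores)[:top_n]
    avg_rec_pop.append(catalog_pop[recs].mean())

print("mean popularity of recommended items:", np.mean(avg_rec_pop))
print("mean popularity over the catalogue  :", catalog_pop.mean())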
+
+ comment: Accepted at FAccTRec@RecSys 2024, 11 pages +
+
+
+
+
+ + ☆ An Evaluation Framework for Attributed Information Retrieval using Large + Language Models + + +
+ With the growing success of Large Language models (LLMs) in +information-seeking scenarios, search engines are now adopting generative +approaches to provide answers along with in-line citations as attribution. +While existing work focuses mainly on attributed question answering, in this +paper, we target information-seeking scenarios which are often more challenging +due to the open-ended nature of the queries and the size of the label space in +terms of the diversity of candidate-attributed answers per query. We propose a +reproducible framework to evaluate and benchmark attributed information +seeking, using any backbone LLM, and different architectural designs: (1) +Generate (2) Retrieve then Generate, and (3) Generate then Retrieve. +Experiments using HAGRID, an attributed information-seeking dataset, show the +impact of different scenarios on both the correctness and attributability of +answers. + +
+
+
+
+
+ + ☆ Collaborative Automatic Modulation Classification via Deep Edge + Inference for Hierarchical Cognitive Radio Networks + + +
+ In hierarchical cognitive radio networks, edge or cloud servers utilize the
+data collected by edge devices for modulation classification, which, however,
+is faced with problems of the transmission overhead, data privacy, and
+computation load. In this article, an edge learning (EL) based framework
+jointly mobilizing the edge device and the edge server for intelligent
+co-inference is proposed to realize the collaborative automatic modulation
+classification (C-AMC) between them. A spectrum semantic compression neural
+network (SSCNet) with the lightweight structure is designed for the edge device
+to compress the collected raw data into a compact semantic message that is then
+sent to the edge server via the wireless channel. On the edge server side, a
+modulation classification neural network (MCNet) combining bidirectional long
+short-term memory (Bi-LSTM) and multi-head attention layers is elaborated to
+determine the modulation type from the noisy semantic message. By leveraging
+the computation resources of both the edge device and the edge server, high
+transmission overhead and risks of data privacy leakage are avoided. The
+simulation results verify the effectiveness of the proposed C-AMC framework,
+significantly reducing the model size and computational complexity.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2407.20772 +
+
+
+
+
+ + ☆ Enhancing Cross-Market Recommendation System with Graph Isomorphism + Networks: A Novel Approach to Personalized User Experience + + +
+ In today's world of globalized commerce, cross-market recommendation systems +(CMRs) are crucial for providing personalized user experiences across diverse +market segments. However, traditional recommendation algorithms have +difficulties dealing with market specificity and data sparsity, especially in +new or emerging markets. In this paper, we propose the CrossGR model, which +utilizes Graph Isomorphism Networks (GINs) to improve CMR systems. It +outperforms existing benchmarks in NDCG@10 and HR@10 metrics, demonstrating its +adaptability and accuracy in handling diverse market segments. The CrossGR +model is adaptable and accurate, making it well-suited for handling the +complexities of cross-market recommendation tasks. Its robustness is +demonstrated by consistent performance across different evaluation timeframes, +indicating its potential to cater to evolving market trends and user +preferences. Our findings suggest that GINs represent a promising direction for +CMRs, paving the way for more sophisticated, personalized, and context-aware +recommendation systems in the dynamic landscape of global e-commerce. + +
+
+ comment: 7 pages, 1 figure, 3 tables, 5 equations +
+
+
+
+
+ + ☆ PDC-FRS: Privacy-preserving Data Contribution for Federated Recommender + System + + +
+ Federated recommender systems (FedRecs) have emerged as a popular research +direction for protecting users' privacy in on-device recommendations. In +FedRecs, users keep their data locally and only contribute their local +collaborative information by uploading model parameters to a central server. +While this rigid framework protects users' raw data during training, it +severely compromises the recommendation model's performance due to the +following reasons: (1) Due to the power law distribution nature of user +behavior data, individual users have few data points to train a recommendation +model, resulting in uploaded model updates that may be far from optimal; (2) As +each user's uploaded parameters are learned from local data, which lacks global +collaborative information, relying solely on parameter aggregation methods such +as FedAvg to fuse global collaborative information may be suboptimal. To bridge +this performance gap, we propose a novel federated recommendation framework, +PDC-FRS. Specifically, we design a privacy-preserving data contribution +mechanism that allows users to share their data with a differential privacy +guarantee. Based on the shared but perturbed data, an auxiliary model is +trained in parallel with the original federated recommendation process. This +auxiliary model enhances FedRec by augmenting each user's local dataset and +integrating global collaborative information. To demonstrate the effectiveness +of PDC-FRS, we conduct extensive experiments on two widely used recommendation +datasets. The empirical results showcase the superiority of PDC-FRS compared to +baseline methods. + +
+
+
+
+
+ + ☆ Music auto-tagging in the long tail: A few-shot approach + + +
+ In the realm of digital music, using tags to efficiently organize and +retrieve music from extensive databases is crucial for music catalog owners. +Human tagging by experts is labor-intensive but mostly accurate, whereas +automatic tagging through supervised learning has approached satisfying +accuracy but is restricted to a predefined set of training tags. Few-shot +learning offers a viable solution to expand beyond this small set of predefined +tags by enabling models to learn from only a few human-provided examples to +understand tag meanings and subsequently apply these tags autonomously. We +propose to integrate few-shot learning methodology into multi-label music +auto-tagging by using features from pre-trained models as inputs to a +lightweight linear classifier, also known as a linear probe. We investigate +different popular pre-trained features, as well as different few-shot +parametrizations with varying numbers of classes and samples per class. Our +experiments demonstrate that a simple model with pre-trained features can +achieve performance close to state-of-the-art models while using significantly +less training data, such as 20 samples per tag. Additionally, our linear probe +performs competitively with leading models when trained on the entire training +dataset. The results show that this transfer learning-based few-shot approach +could effectively address the issue of automatically assigning long-tail tags +with only limited labeled data. + +
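A minimal sketch of the linear-probe idea follows: frozen pre-trained embeddings feed a per-tag logistic regression trained on only a handful of examples per tag. The random feature matrices stand in for real audio embeddings and annotations, and the probe uses scikit-learn rather than any specific setup from the paper.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import average_precision_score

rng = np.random.default_rng(0)
n_tags, feat_dim, shots_per_tag = 15, 512, 20

# Stand-ins for embeddings from a frozen pre-trained audio model.
X_train = rng.normal(size=(n_tags * shots_per_tag, feat_dim))
Y_train = rng.integers(0, 2, size=(n_tags * shots_per_tag, n_tags))
X_test = rng.normal(size=(200, feat_dim))
Y_test = rng.integers(0, 2, size=(200, n_tags))

# Linear probe: one logistic regression per tag on top of frozen features.
probe = OneVsRestClassifier(LogisticRegression(max_iter=1000))
probe.fit(X_train, Y_train)
scores = probe.predict_proba(X_test)
print("macro mAP:", average_precision_score(Y_test, scores, average="macro"))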
+
+ comment: Published in Audio Engineering Society NY Show 2024 as a Peer + Reviewed (Category 1) paper +
+
+
+
+
+ + ☆ Harnessing TI Feeds for Exploitation Detection + + +
+ Many organizations rely on Threat Intelligence (TI) feeds to assess the risk +associated with security threats. Due to the volume and heterogeneity of data, +it is prohibitive to manually analyze the threat information available in +different loosely structured TI feeds. Thus, there is a need to develop +automated methods to vet and extract actionable information from TI feeds. To +this end, we present a machine learning pipeline to automatically detect +vulnerability exploitation from TI feeds. We first model threat vocabulary in +loosely structured TI feeds using state-of-the-art embedding techniques +(Doc2Vec and BERT) and then use it to train a supervised machine learning +classifier to detect exploitation of security vulnerabilities. We use our +approach to identify exploitation events in 191 different TI feeds. Our +longitudinal evaluation shows that it is able to accurately identify +exploitation events from TI feeds only using past data for training and even on +TI feeds withheld from training. Our proposed approach is useful for a variety +of downstream tasks such as data-driven vulnerability risk assessment. + +
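A stripped-down version of such a pipeline might look like the sketch below, assuming gensim and scikit-learn are available: TI-feed snippets are embedded with Doc2Vec and a logistic-regression classifier flags likely exploitation. The feed entries, CVE identifiers, and labels are fabricated for illustration and do not come from the feeds analyzed in the paper.

import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

# Toy TI-feed snippets; labels mark whether the entry reports active exploitation.
docs = [
    ("CVE-2023-1234 exploited in the wild by ransomware group", 1),
    ("new proof-of-concept exploit observed for unpatched RCE flaw", 1),
    ("vendor releases advisory describing a low-severity information leak", 0),
    ("patch available, no known exploitation reported at this time", 0),
]
tagged = [TaggedDocument(words=text.lower().split(), tags=[i])
          for i, (text, _) in enumerate(docs)]

# Learn a small document embedding of the threat vocabulary.
d2v = Doc2Vec(tagged, vector_size=32, min_count=1, epochs=50, seed=1)
X = np.array([d2v.dv[i] for i in range(len(docs))])
y = np.array([label for _, label in docs])

clf = LogisticRegression(max_iter=1000).fit(X, y)
new = "active exploitation of CVE-2024-0001 reported by multiple sensors"
vec = d2v.infer_vector(new.lower().split()).reshape(1, -1)
print("exploitation probability:", clf.predict_proba(vec)[0, 1])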
+
+ comment: This paper appears at IEEE International Conference on Cyber Security + and Resilience (IEEE CSR 2024) +
+
+
+
+
+ + ☆ Enhancing Q&A Text Retrieval with Ranking Models: Benchmarking, + fine-tuning and deploying Rerankers for RAG CIKM 2024 + + +
+ Ranking models play a crucial role in enhancing overall accuracy of text
+retrieval systems. These multi-stage systems typically utilize either dense
+embedding models or sparse lexical indices to retrieve relevant passages based
+on a given query, followed by ranking models that refine the ordering of the
+candidate passages by their relevance to the query.
+ This paper benchmarks various publicly available ranking models and examines
+their impact on ranking accuracy. We focus on text retrieval for
+question-answering tasks, a common use case for Retrieval-Augmented Generation
+systems. Our evaluation benchmarks include models, some of which are
+commercially viable for industrial applications.
+ We introduce a state-of-the-art ranking model, NV-RerankQA-Mistral-4B-v3,
+which achieves a significant accuracy increase of ~14% compared to pipelines
+with other rerankers. We also provide an ablation study comparing the
+fine-tuning of ranking models with different sizes, losses and self-attention
+mechanisms.
+ Finally, we discuss challenges of text retrieval pipelines with ranking
+models in real-world industry applications, in particular the trade-offs among
+model size, ranking accuracy and system requirements like indexing and serving
+latency / throughput.
+
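The two-stage retrieve-then-rerank pattern discussed here can be sketched as follows; the bi-encoder and cross-encoder checkpoints named below are common public sentence-transformers models used purely as stand-ins, not the NV-RerankQA-Mistral-4B-v3 model or the pipelines benchmarked in the paper.

from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Stage-1 retriever (bi-encoder) and stage-2 reranker (cross-encoder).
retriever = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

passages = [
    "The capital of France is Paris.",
    "Reranking models reorder retrieved passages by relevance to the query.",
    "BM25 is a sparse lexical retrieval function.",
]
query = "What does a reranker do in a retrieval pipeline?"

# Stage 1: shortlist the top-k passages by embedding similarity.
p_emb = retriever.encode(passages, convert_to_tensor=True)
q_emb = retriever.encode(query, convert_to_tensor=True)
top_k = util.cos_sim(q_emb, p_emb)[0].argsort(descending=True)[:2]

# Stage 2: rescore the shortlisted passages with the cross-encoder.
candidates = [passages[int(i)] for i in top_k]
scores = reranker.predict([(query, p) for p in candidates])
for p, s in sorted(zip(candidates, scores), key=lambda x: -x[1]):
    print(f"{s:.3f}  {p}")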
+
+ comment: Accepted for the 1st Workshop on GenAI and RAG Systems for Enterprise + @ CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Latent Diffusion Bridges for Unsupervised Musical Audio Timbre Transfer + + +
+ Music timbre transfer is a challenging task that involves modifying the +timbral characteristics of an audio signal while preserving its melodic +structure. In this paper, we propose a novel method based on dual diffusion +bridges, trained using the CocoChorales Dataset, which consists of unpaired +monophonic single-instrument audio data. Each diffusion model is trained on a +specific instrument with a Gaussian prior. During inference, a model is +designated as the source model to map the input audio to its corresponding +Gaussian prior, and another model is designated as the target model to +reconstruct the target audio from this Gaussian prior, thereby facilitating +timbre transfer. We compare our approach against existing unsupervised timbre +transfer models such as VAEGAN and Gaussian Flow Bridges (GFB). Experimental +results demonstrate that our method achieves both better Fr\'echet Audio +Distance (FAD) and melody preservation, as reflected by lower pitch distances +(DPD) compared to VAEGAN and GFB. Additionally, we discover that the noise +level from the Gaussian prior, $\sigma$, can be adjusted to control the degree +of melody preservation and amount of timbre transferred. + +
+
+
+
+
+ + ♻ ☆ DrugAgent: Explainable Drug Repurposing Agent with Large Language + Model-based Reasoning + + +
+ Drug repurposing offers a promising avenue for accelerating drug development +by identifying new therapeutic potentials of existing drugs. In this paper, we +propose a multi-agent framework to enhance the drug repurposing process using +state-of-the-art machine learning techniques and knowledge integration. Our +framework comprises several specialized agents: an AI Agent trains robust +drug-target interaction (DTI) models; a Knowledge Graph Agent utilizes the +drug-gene interaction database (DGIdb), DrugBank, Comparative Toxicogenomics +Database (CTD), and Search Tool for Interactions of Chemicals (STITCH) to +systematically extract DTIs; and a Search Agent interacts with biomedical +literature to annotate and verify computational predictions. By integrating +outputs from these agents, our system effectively harnesses diverse data +sources, including external databases, to propose viable repurposing +candidates. Preliminary results demonstrate the potential of our approach in +not only predicting drug-disease interactions but also in reducing the time and +cost associated with traditional drug discovery methods. This paper highlights +the scalability of multi-agent systems in biomedical research and their role in +driving innovation in drug repurposing. Our approach not only outperforms +existing methods in predicting drug repurposing potential but also provides +interpretable results, paving the way for more efficient and cost-effective +drug discovery processes. + +
+
+ comment: 18 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ GACL: Graph Attention Collaborative Learning for Temporal QoS Prediction + + +
+ Accurate prediction of temporal QoS is crucial for maintaining service +reliability and enhancing user satisfaction in dynamic service-oriented +environments. However, current methods often neglect high-order latent +collaborative relationships and fail to dynamically adjust feature learning for +specific user-service invocations, which are critical for precise feature +extraction within each time slice. Moreover, the prevalent use of RNNs for +modeling temporal feature evolution patterns is constrained by their inherent +difficulty in managing long-range dependencies, thereby limiting the detection +of long-term QoS trends across multiple time slices. These shortcomings +dramatically degrade the performance of temporal QoS prediction. To address the +two issues, we propose a novel Graph Attention Collaborative Learning (GACL) +framework for temporal QoS prediction. Building on a dynamic user-service +invocation graph to comprehensively model historical interactions, it designs a +target-prompt graph attention network to extract deep latent features of users +and services at each time slice, considering implicit target-neighboring +collaborative relationships and historical QoS values. Additionally, a +multi-layer Transformer encoder is introduced to uncover temporal feature +evolution patterns, enhancing temporal QoS prediction. Extensive experiments on +the WS-DREAM dataset demonstrate that GACL significantly outperforms +state-of-the-art methods for temporal QoS prediction across multiple evaluation +metrics, achieving the improvements of up to 38.80%. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+
+
+
+ + Machine Learning 12 + +
+
+
+ + ☆ Fitted Q-Iteration via Max-Plus-Linear Approximation + + +
+ In this study, we consider the application of max-plus-linear approximators +for Q-function in offline reinforcement learning of discounted Markov decision +processes. In particular, we incorporate these approximators to propose novel +fitted Q-iteration (FQI) algorithms with provable convergence. Exploiting the +compatibility of the Bellman operator with max-plus operations, we show that +the max-plus-linear regression within each iteration of the proposed FQI +algorithm reduces to simple max-plus matrix-vector multiplications. We also +consider the variational implementation of the proposed algorithm which leads +to a per-iteration complexity that is independent of the number of samples. + +
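For intuition, the max-plus operations mentioned here can be written in a few lines of NumPy: the max-plus matrix-vector product and the standard principal-solution (residuation) formula that fits max-plus coefficients from below. This is a generic illustration of max-plus algebra on toy data, not the specific FQI algorithm or its convergence analysis.

import numpy as np

def maxplus_matvec(A, x):
    """Max-plus product: (A (x) x)_i = max_j (A_ij + x_j)."""
    return (A + x[None, :]).max(axis=1)

def maxplus_regression(Phi, b):
    """Principal (largest) solution of Phi (x) theta <= b, the standard
    max-plus analogue of fitting from below: theta_j = min_i (b_i - Phi_ij)."""
    return (b[:, None] - Phi).min(axis=0)

# Toy setup: Phi holds basis values at sampled state-action pairs, b holds
# Bellman targets r + gamma * max_a' Q(s', a') computed from a previous iterate.
rng = np.random.default_rng(0)
Phi = rng.normal(size=(50, 8))          # 50 samples, 8 max-plus basis functions
b = rng.normal(size=50) + 1.0           # hypothetical Bellman targets

theta = maxplus_regression(Phi, b)
q_fit = maxplus_matvec(Phi, theta)      # fitted Q-values at the samples
print("max under-approximation error:", np.max(b - q_fit))
assert np.all(q_fit <= b + 1e-9)        # the principal solution never overshoots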
+
+
+
+
+ + ☆ Introducing CausalBench: A Flexible Benchmark Framework for Causal + Analysis and Machine Learning + + +
+ While witnessing the exceptional success of machine learning (ML)
+technologies in many applications, users are starting to notice a critical
+shortcoming of ML: correlation is a poor substitute for causation. The
+conventional way to discover causal relationships is to use randomized
+controlled experiments (RCT); in many situations, however, these are
+impractical or sometimes unethical. Causal learning from observational data
+offers a promising alternative. While being relatively recent, causal learning
+aims to go far beyond conventional machine learning, yet several major
+challenges remain. Unfortunately, advances are hampered due to the lack of
+unified benchmark datasets, algorithms, metrics, and evaluation service
+interfaces for causal learning. In this paper, we introduce {\em CausalBench},
+a transparent, fair, and easy-to-use evaluation platform, aiming to (a) enable
+the advancement of research in causal learning by facilitating scientific
+collaboration in novel algorithms, datasets, and metrics and (b) promote
+scientific objectivity, reproducibility, fairness, and awareness of bias in
+causal learning research. CausalBench provides services for benchmarking data,
+algorithms, models, and metrics, serving the needs of a broad range of
+scientific and engineering disciplines.
+
+
+
+
+
+ + ☆ Wasserstein Distributionally Robust Multiclass Support Vector Machine + + +
+ We study the problem of multiclass classification for settings where data
+features $\mathbf{x}$ and their labels $\mathbf{y}$ are uncertain. We identify
+that distributionally robust one-vs-all (OVA) classifiers often struggle in
+settings with imbalanced data. To address this issue, we use Wasserstein
+distributionally robust optimization to develop a robust version of the
+multiclass support vector machine (SVM) characterized by the Crammer-Singer
+(CS) loss. First, we prove that the CS loss is bounded from above by a
+Lipschitz continuous function for all $\mathbf{x} \in \mathcal{X}$ and
+$\mathbf{y} \in \mathcal{Y}$, then we exploit strong duality results to express
+the dual of the worst-case risk problem, and we show that the worst-case risk
+minimization problem admits a tractable convex reformulation due to the
+regularity of the CS loss. Moreover, we develop a kernel version of our
+proposed model to account for nonlinear class separation, and we show that it
+admits a tractable convex upper bound. We also propose a projected subgradient
+method algorithm for a special case of our proposed linear model to improve
+scalability. Our numerical experiments demonstrate that our model outperforms
+state-of-the-art OVA models in settings where the training data is highly
+imbalanced. We also show through experiments on popular real-world datasets
+that our proposed model often outperforms its regularized counterpart, as the
+former accounts for uncertain labels, unlike the latter.
+
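The Crammer-Singer loss at the heart of the model is simple to state; the sketch below evaluates it for a single sample with a random weight matrix. The distributionally robust reformulation itself (Wasserstein ambiguity set, dualization, kernelization) is not reproduced here.

import numpy as np

def crammer_singer_loss(W, x, y):
    """Multiclass hinge (Crammer-Singer) loss for one sample:
        max(0, 1 + max_{y' != y} <w_{y'}, x> - <w_y, x>)."""
    scores = W @ x                       # one score per class
    margin = scores.copy()
    margin[y] = -np.inf                  # exclude the true class from the max
    return max(0.0, 1.0 + margin.max() - scores[y])

rng = np.random.default_rng(0)
n_classes, dim = 4, 10
W = rng.normal(size=(n_classes, dim))    # one weight vector per class
x, y = rng.normal(size=dim), 2
print("CS loss:", crammer_singer_loss(W, x, y))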
+
+ comment: 26 pages, 7 figures +
+
+
+
+
+ + ☆ Scores as Actions: a framework of fine-tuning diffusion models by + continuous-time reinforcement learning + + +
+ Reinforcement Learning from human feedback (RLHF) has been shown to be a
+promising direction for aligning generative models with human intent and has
+also been explored in recent works for alignment of diffusion generative
+models. In this work, we provide a rigorous treatment by formulating the task
+of fine-tuning diffusion models, with reward functions learned from human
+feedback, as an exploratory continuous-time stochastic control problem. Our key
+idea lies in treating the score-matching functions as controls/actions, and
+upon this, we develop a unified framework from a continuous-time perspective,
+to employ reinforcement learning (RL) algorithms in terms of improving the
+generation quality of diffusion models. We also develop the corresponding
+continuous-time RL theory for policy optimization and regularization under the
+assumption of an environment driven by stochastic differential equations.
+Experiments on text-to-image (T2I) generation will be reported in the
+accompanying paper.
+
+
+
+
+
+ + ☆ Federated One-Shot Ensemble Clustering + + +
+ Cluster analysis across multiple institutions poses significant challenges +due to data-sharing restrictions. To overcome these limitations, we introduce +the Federated One-shot Ensemble Clustering (FONT) algorithm, a novel solution +tailored for multi-site analyses under such constraints. FONT requires only a +single round of communication between sites and ensures privacy by exchanging +only fitted model parameters and class labels. The algorithm combines locally +fitted clustering models into a data-adaptive ensemble, making it broadly +applicable to various clustering techniques and robust to differences in +cluster proportions across sites. Our theoretical analysis validates the +effectiveness of the data-adaptive weights learned by FONT, and simulation +studies demonstrate its superior performance compared to existing benchmark +methods. We applied FONT to identify subgroups of patients with rheumatoid +arthritis across two health systems, revealing improved consistency of patient +clusters across sites, while locally fitted clusters proved less transferable. +FONT is particularly well-suited for real-world applications with stringent +communication and privacy constraints, offering a scalable and practical +solution for multi-site clustering. + +
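One way to picture a one-shot, centroids-only ensemble is sketched below: each site fits K-means locally, only the fitted centroids are communicated, cluster indices are aligned across sites by centroid matching, and new points are assigned by a weighted consensus. The fixed equal weights and Hungarian alignment are simplifications for illustration; FONT learns data-adaptive ensemble weights rather than using this exact rule.

import numpy as np
from sklearn.cluster import KMeans
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
k = 3
# Two sites draw from the same three clusters but with different proportions;
# only the fitted centroids leave each site (one round of communication).
centers = np.array([[0, 0], [5, 5], [0, 6]], dtype=float)
site_data = [centers[rng.choice(k, size=n, p=p)] + rng.normal(scale=0.7, size=(n, 2))
             for n, p in [(300, [0.6, 0.3, 0.1]), (300, [0.2, 0.2, 0.6])]]
site_models = [KMeans(n_clusters=k, n_init=10, random_state=0).fit(X) for X in site_data]

# Align site 1's cluster indices to site 0's by matching centroids.
d = np.linalg.norm(site_models[0].cluster_centers_[:, None]
                   - site_models[1].cluster_centers_[None, :], axis=2)
_, perm = linear_sum_assignment(d)

def ensemble_assign(X, w=(0.5, 0.5)):
    """Weighted consensus: combine distances to each site's aligned centroids,
    then take the closest cluster."""
    d0 = np.linalg.norm(X[:, None] - site_models[0].cluster_centers_[None], axis=2)
    d1 = np.linalg.norm(X[:, None] - site_models[1].cluster_centers_[perm][None], axis=2)
    return np.argmin(w[0] * d0 + w[1] * d1, axis=1)

print(ensemble_assign(np.array([[0.2, 0.1], [4.8, 5.2], [0.3, 5.9]])))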
+
+
+
+
+ + ☆ Graphical Structural Learning of rs-fMRI data in Heavy Smokers CCS + + +
+ Recent studies revealed structural and functional brain changes in heavy +smokers. However, the specific changes in topological brain connections are not +well understood. We used Gaussian Undirected Graphs with the graphical lasso +algorithm on rs-fMRI data from smokers and non-smokers to identify significant +changes in brain connections. Our results indicate high stability in the +estimated graphs and identify several brain regions significantly affected by +smoking, providing valuable insights for future clinical research. + +
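A minimal version of the graphical-lasso step can be run with scikit-learn as sketched below, using random data in place of preprocessed rs-fMRI time series; the nonzero entries of the estimated precision matrix define the edges of the Gaussian undirected graph.

import numpy as np
from sklearn.covariance import GraphicalLassoCV

# Stand-in for per-subject rs-fMRI time series: (time points, brain regions).
rng = np.random.default_rng(0)
n_timepoints, n_regions = 150, 20
ts = rng.normal(size=(n_timepoints, n_regions))
ts[:, 1] += 0.8 * ts[:, 0]               # inject one correlated pair of regions

model = GraphicalLassoCV().fit(ts)        # sparsity level chosen by cross-validation
precision = model.precision_              # zero entries = conditional independence
edges = np.argwhere(np.triu(np.abs(precision) > 1e-4, k=1))
print(f"estimated functional edges: {len(edges)}")
print("edge between regions 0 and 1 recovered:", [0, 1] in edges.tolist())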
+
+ comment: Accepted by IEEE CCSB 2024 conference +
+
+
+
+
+ + ☆ Higher-Order Topological Directionality and Directed Simplicial Neural + Networks + + +
+ Topological Deep Learning (TDL) has emerged as a paradigm to process and +learn from signals defined on higher-order combinatorial topological spaces, +such as simplicial or cell complexes. Although many complex systems have an +asymmetric relational structure, most TDL models forcibly symmetrize these +relationships. In this paper, we first introduce a novel notion of higher-order +directionality and we then design Directed Simplicial Neural Networks +(Dir-SNNs) based on it. Dir-SNNs are message-passing networks operating on +directed simplicial complexes able to leverage directed and possibly asymmetric +interactions among the simplices. To our knowledge, this is the first TDL model +using a notion of higher-order directionality. We theoretically and empirically +prove that Dir-SNNs are more expressive than their directed graph counterpart +in distinguishing isomorphic directed graphs. Experiments on a synthetic source +localization task demonstrate that Dir-SNNs outperform undirected SNNs when the +underlying complex is directed, and perform comparably when the underlying +complex is undirected. + +
+
+ comment: 7 pages, 8 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Deep Learning for Network Anomaly Detection under Data Contamination: + Evaluating Robustness and Mitigating Performance Degradation + + +
+ Deep learning (DL) has emerged as a crucial tool in network anomaly detection +(NAD) for cybersecurity. While DL models for anomaly detection excel at +extracting features and learning patterns from data, they are vulnerable to +data contamination -- the inadvertent inclusion of attack-related data in +training sets presumed benign. This study evaluates the robustness of six +unsupervised DL algorithms against data contamination using our proposed +evaluation protocol. Results demonstrate significant performance degradation in +state-of-the-art anomaly detection algorithms when exposed to contaminated +data, highlighting the critical need for self-protection mechanisms in DL-based +NAD models. To mitigate this vulnerability, we propose an enhanced auto-encoder +with a constrained latent representation, allowing normal data to cluster more +densely around a learnable center in the latent space. Our evaluation reveals +that this approach exhibits improved resistance to data contamination compared +to existing methods, offering a promising direction for more robust NAD +systems. + +
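One way to realize the described constraint, i.e. pulling latent codes of presumed-normal traffic toward a learnable center so contaminated samples remain distant, is a Deep SVDD-style penalty added to the reconstruction loss, as in the PyTorch sketch below. The architecture, penalty weight, and feature dimensions are illustrative assumptions rather than the paper's enhanced auto-encoder.

import torch
import torch.nn as nn

class CenteredAE(nn.Module):
    """Auto-encoder whose latent codes are pulled toward a learnable center,
    so normal traffic clusters densely and anomalous samples stand out by
    their distance to the center."""
    def __init__(self, in_dim=40, latent_dim=8):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(in_dim, 32), nn.ReLU(),
                                 nn.Linear(32, latent_dim))
        self.dec = nn.Sequential(nn.Linear(latent_dim, 32), nn.ReLU(),
                                 nn.Linear(32, in_dim))
        self.center = nn.Parameter(torch.zeros(latent_dim))

    def forward(self, x):
        z = self.enc(x)
        return self.dec(z), z

model = CenteredAE()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
x = torch.randn(256, 40)                     # stand-in for flow features
for _ in range(5):                           # a few illustrative training steps
    recon, z = model(x)
    loss = ((recon - x) ** 2).mean() + 0.1 * ((z - model.center) ** 2).sum(dim=1).mean()
    opt.zero_grad(); loss.backward(); opt.step()

# Anomaly score at test time: distance of the latent code to the center.
with torch.no_grad():
    _, z = model(x)
    scores = ((z - model.center) ** 2).sum(dim=1)
print("score range:", scores.min().item(), scores.max().item())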
+
+ comment: arXiv admin note: text overlap with arXiv:2207.03576 +
+
+
+
+
+ + ♻ ☆ DemoStart: Demonstration-led auto-curriculum applied to sim-to-real with + multi-fingered robots + + +
+ We present DemoStart, a novel auto-curriculum reinforcement learning method +capable of learning complex manipulation behaviors on an arm equipped with a +three-fingered robotic hand, from only a sparse reward and a handful of +demonstrations in simulation. Learning from simulation drastically reduces the +development cycle of behavior generation, and domain randomization techniques +are leveraged to achieve successful zero-shot sim-to-real transfer. Transferred +policies are learned directly from raw pixels from multiple cameras and robot +proprioception. Our approach outperforms policies learned from demonstrations +on the real robot and requires 100 times fewer demonstrations, collected in +simulation. More details and videos in https://sites.google.com/view/demostart. + +
+
+ comment: 15 pages total with 7 pages of appendix. 9 Figures, 4 in the main + text and 5 in the appendix +
+
+
+
+
+ + ♻ ☆ Assessing Adversarial Robustness of Large Language Models: An Empirical + Study KDD 2024 + + +
+ Large Language Models (LLMs) have revolutionized natural language processing,
+but their robustness against adversarial attacks remains a critical concern. We
+present a novel white-box-style attack approach that exposes vulnerabilities in
+leading open-source LLMs, including Llama, OPT, and T5. We assess the impact of
+model size, structure, and fine-tuning strategies on their resistance to
+adversarial perturbations. Our comprehensive evaluation across five diverse
+text classification tasks establishes a new benchmark for LLM robustness. The
+findings of this study have far-reaching implications for the reliable
+deployment of LLMs in real-world applications and contribute to the advancement
+of trustworthy AI systems.
+
+
+ comment: Oral presentation at KDD 2024 GenAI Evaluation workshop +
+
+
+
+
+ + ♻ ☆ NGD converges to less degenerate solutions than SGD + + +
+ The number of free parameters, or dimension, of a model is a straightforward +way to measure its complexity: a model with more parameters can encode more +information. However, this is not an accurate measure of complexity: models +capable of memorizing their training data often generalize well despite their +high dimension. Effective dimension aims to more directly capture the +complexity of a model by counting only the number of parameters required to +represent the functionality of the model. Singular learning theory (SLT) +proposes the learning coefficient $ \lambda $ as a more accurate measure of +effective dimension. By describing the rate of increase of the volume of the +region of parameter space around a local minimum with respect to loss, $ +\lambda $ incorporates information from higher-order terms. We compare $ +\lambda $ of models trained using natural gradient descent (NGD) and stochastic +gradient descent (SGD), and find that those trained with NGD consistently have +a higher effective dimension for both of our methods: the Hessian trace $ +\text{Tr}(\mathbf{H}) $, and the estimate of the local learning coefficient +(LLC) $ \hat{\lambda}(w^*) $. + +
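The Hessian trace $ \text{Tr}(\mathbf{H}) $ used here as one effective-dimension proxy can be estimated without ever forming the Hessian, using Hutchinson's estimator with Hessian-vector products, as in the PyTorch sketch below. The toy quadratic loss and probe count are illustrative assumptions, and this does not cover the separate procedure needed to estimate the local learning coefficient $ \hat{\lambda}(w^*) $.

import torch

def hessian_trace(loss_fn, params, n_probes=10):
    """Hutchinson estimator of Tr(H): E_v[v^T H v] with Rademacher probes v,
    using Hessian-vector products so the full Hessian is never formed."""
    loss = loss_fn()
    grads = torch.autograd.grad(loss, params, create_graph=True)
    est = 0.0
    for _ in range(n_probes):
        vs = [torch.randint_like(p, 0, 2) * 2.0 - 1.0 for p in params]  # +/-1 probes
        gv = sum((g * v).sum() for g, v in zip(grads, vs))
        hvs = torch.autograd.grad(gv, params, retain_graph=True)
        est += sum((h * v).sum() for h, v in zip(hvs, vs)).item()
    return est / n_probes

# Toy model with a simple quadratic-style loss.
w = torch.randn(20, requires_grad=True)
x, y = torch.randn(64, 20), torch.randn(64)
loss_fn = lambda: ((x @ w - y) ** 2).mean()
print("estimated Tr(H):", hessian_trace(loss_fn, [w]))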
+
+ comment: 8 pages, 23 figures +
+
+
+
+
+ + ♻ ☆ Biased Backpressure Routing Using Link Features and Graph Neural + Networks + + +
+ To reduce the latency of Backpressure (BP) routing in wireless multi-hop
+networks, we propose to enhance the existing shortest path-biased BP (SP-BP)
+and sojourn time-based backlog metrics, since they introduce no additional time
+step-wise signaling overhead to the basic BP. Rather than relying on
+hop-distance, we introduce a new edge-weighted shortest path bias built on the
+scheduling duty cycle of wireless links, which can be predicted by a graph
+convolutional neural network based on the topology and traffic of wireless
+networks. Additionally, we tackle three long-standing challenges associated
+with SP-BP: optimal bias scaling, efficient bias maintenance, and integration
+of delay awareness. Our proposed solutions inherit the throughput optimality of
+the basic BP, as well as its practical advantages of low complexity and fully
+distributed implementation. Our approaches rely on common link features and
+introduce only a one-time constant overhead to previous SP-BP schemes, or a
+one-time overhead linear in the network size to the basic BP. Numerical
+experiments show that our solutions can effectively address the major drawbacks
+of slow startup, random walk, and the last packet problem in basic BP,
+improving the end-to-end delay of existing low-overhead BP algorithms under
+various settings of network traffic, interference, and mobility.
+
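The edge-weighted shortest-path bias itself is easy to prototype: given per-link weights (here standing in for predicted scheduling duty cycles), a Dijkstra pass gives each node's bias toward the destination, which is then added to the queue differential in the backpressure link weight. The networkx sketch below uses toy weights and backlogs and omits the bias scaling, maintenance, and delay-awareness mechanisms discussed in the abstract.

import networkx as nx

# Toy multi-hop network; edge weight stands in for the (GNN-predicted)
# scheduling duty cycle of each wireless link.
G = nx.Graph()
G.add_weighted_edges_from([(0, 1, 1.0), (1, 2, 0.5), (0, 3, 2.0),
                           (3, 2, 0.5), (1, 3, 1.5)])
dst = 2

# Edge-weighted shortest-path bias of every node toward the destination.
bias = nx.single_source_dijkstra_path_length(G, dst, weight="weight")

# Biased backpressure weight for link (i, j): queue differential plus bias gap.
queues = {0: 9, 1: 4, 2: 0, 3: 6}          # per-destination backlogs (toy values)
def bp_weight(i, j):
    return (queues[i] - queues[j]) + (bias[i] - bias[j])

for i, j in G.edges():
    print(f"link ({i},{j}): forward weight {bp_weight(i, j):+.1f}")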
+
+ comment: 16 pages, 15 figures, accepted for publication in IEEE Transactions + on Machine Learning in Communications and Networking. arXiv admin note: text + overlap with arXiv:2310.04364, arXiv:2211.10748 +
+
+
+
+
+
+
+
+ + Multimedia 12 + +
+
+
+ + ☆ Rethinking Prompting Strategies for Multi-Label Recognition with Partial + Annotations + + +
+ Vision-language models (VLMs) like CLIP have been adapted for Multi-Label
+Recognition (MLR) with partial annotations by leveraging prompt-learning, where
+positive and negative prompts are learned for each class to associate their
+embeddings with class presence or absence in the shared vision-text feature
+space. While this approach improves MLR performance by relying on VLM priors,
+we hypothesize that learning negative prompts may be suboptimal, as the
+datasets used to train VLMs lack image-caption pairs explicitly focusing on
+class absence. To analyze the impact of positive and negative prompt learning
+on MLR, we introduce PositiveCoOp and NegativeCoOp, where only one prompt is
+learned with VLM guidance while the other is replaced by an embedding vector
+learned directly in the shared feature space without relying on the text
+encoder. Through empirical analysis, we observe that negative prompts degrade
+MLR performance, and learning only positive prompts, combined with learned
+negative embeddings (PositiveCoOp), outperforms dual prompt learning
+approaches. Moreover, we quantify the performance benefits that prompt-learning
+offers over a simple vision-features-only baseline, observing that the baseline
+displays strong performance comparable to the dual prompt learning approach
+(DualCoOp) when the proportion of missing labels is low, while requiring half
+the training compute and 16 times fewer parameters.
+
+
+
+
+
+ + ☆ FlashSplat: 2D to 3D Gaussian Splatting Segmentation Solved Optimally ECCV'2024 + + +
+ This study addresses the challenge of accurately segmenting 3D Gaussian +Splatting from 2D masks. Conventional methods often rely on iterative gradient +descent to assign each Gaussian a unique label, leading to lengthy optimization +and sub-optimal solutions. Instead, we propose a straightforward yet globally +optimal solver for 3D-GS segmentation. The core insight of our method is that, +with a reconstructed 3D-GS scene, the rendering of the 2D masks is essentially +a linear function with respect to the labels of each Gaussian. As such, the +optimal label assignment can be solved via linear programming in closed form. +This solution capitalizes on the alpha blending characteristic of the splatting +process for single step optimization. By incorporating the background bias in +our objective function, our method shows superior robustness in 3D segmentation +against noises. Remarkably, our optimization completes within 30 seconds, about +50$\times$ faster than the best existing methods. Extensive experiments +demonstrate the efficiency and robustness of our method in segmenting various +scenes, and its superior performance in downstream tasks such as object removal +and inpainting. Demos and code will be available at +https://github.com/florinshen/FlashSplat. + +
+
+ comment: ECCV'2024 +
+
+
+
+
+ + ☆ Improving Text-guided Object Inpainting with Semantic Pre-inpainting ECCV 2024 + + +
+ Recent years have witnessed the success of large text-to-image diffusion
+models and their remarkable potential to generate high-quality images. The
+further pursuit of enhancing the editability of images has sparked significant
+interest in the downstream task of inpainting a novel object described by a
+text prompt within a designated region in the image. Nevertheless, the problem
+is not trivial from two aspects: 1) Solely relying on one single U-Net to align
+text prompt and visual object across all the denoising timesteps is
+insufficient to generate desired objects; 2) The controllability of object
+generation is not guaranteed in the intricate sampling space of diffusion
+model. In this paper, we propose to decompose the typical single-stage object
+inpainting into two cascaded processes: 1) semantic pre-inpainting that infers
+the semantic features of desired objects in a multi-modal feature space; 2)
+high-fidelity object generation in diffusion latent space that pivots on such
+inpainted semantic features. To achieve this, we cascade a Transformer-based
+semantic inpainter and an object inpainting diffusion model, leading to a novel
+CAscaded Transformer-Diffusion (CAT-Diffusion) framework for text-guided object
+inpainting. Technically, the semantic inpainter is trained to predict the
+semantic features of the target object conditioning on unmasked context and
+text prompt. The outputs of the semantic inpainter then act as the informative
+visual prompts to guide high-fidelity object generation through a reference
+adapter layer, leading to controllable object inpainting. Extensive evaluations
+on OpenImages-V6 and MSCOCO validate the superiority of CAT-Diffusion against
+the state-of-the-art methods. Code is available at
+\url{https://github.com/Nnn-s/CATdiffusion}.
+
+
+ comment: ECCV 2024. Source code is available at + https://github.com/Nnn-s/CATdiffusion +
+
+
+
+
+ + ☆ Improving Virtual Try-On with Garment-focused Diffusion Models ECCV 2024 + + +
+ Diffusion models have led to the revolutionizing of generative modeling in +numerous image synthesis tasks. Nevertheless, it is not trivial to directly +apply diffusion models for synthesizing an image of a target person wearing a +given in-shop garment, i.e., image-based virtual try-on (VTON) task. The +difficulty originates from the aspect that the diffusion process should not +only produce holistically high-fidelity photorealistic image of the target +person, but also locally preserve every appearance and texture detail of the +given garment. To address this, we shape a new Diffusion model, namely GarDiff, +which triggers the garment-focused diffusion process with amplified guidance of +both basic visual appearance and detailed textures (i.e., high-frequency +details) derived from the given garment. GarDiff first remoulds a pre-trained +latent diffusion model with additional appearance priors derived from the CLIP +and VAE encodings of the reference garment. Meanwhile, a novel garment-focused +adapter is integrated into the UNet of diffusion model, pursuing local +fine-grained alignment with the visual appearance of reference garment and +human pose. We specifically design an appearance loss over the synthesized +garment to enhance the crucial, high-frequency details. Extensive experiments +on VITON-HD and DressCode datasets demonstrate the superiority of our GarDiff +when compared to state-of-the-art VTON approaches. Code is publicly available +at: +\href{https://github.com/siqi0905/GarDiff/tree/master}{https://github.com/siqi0905/GarDiff/tree/master}. + +
+
+ comment: ECCV 2024. Source code is available at + https://github.com/siqi0905/GarDiff/tree/master +
+
+
+
+
+ + ☆ ComAlign: Compositional Alignment in Vision-Language Models + + +
+ Vision-language models (VLMs) like CLIP have showcased a remarkable ability
+ to extract transferable features for downstream tasks. Nonetheless, the
+ training process of these models is usually based on a coarse-grained
+ contrastive loss between the global embeddings of images and texts, which may
+ lose the compositional structure of these modalities. Many recent studies have
+ shown that VLMs lack compositional understanding, such as attribute binding and
+ identifying object relationships. Although some recent methods have tried to
+ achieve finer-level alignments, they are either not based on extracting
+ meaningful components of proper granularity or do not properly utilize the
+ modalities' correspondence (especially in image-text pairs with more
+ ingredients). Addressing these limitations, we introduce Compositional
+ Alignment (ComAlign), a fine-grained approach to discover more exact
+ correspondence of text and image components using only the weak supervision in
+ the form of image-text pairs. Our methodology emphasizes that the compositional
+ structure (including entities and relations) extracted from the text modality
+ must also be retained in the image modality. To enforce the correspondence of
+ fine-grained concepts in the image and text modalities, we train a lightweight
+ network on top of existing visual and language encoders using a small
+ dataset. The network is trained to align nodes and edges of the structure
+ across the modalities. Experimental results on various VLMs and datasets
+ demonstrate significant improvements in retrieval and compositional benchmarks,
+ affirming the effectiveness of our plugin model.
+
+
+
+
+
+ + ☆ Bridging Discrete and Continuous: A Multimodal Strategy for Complex + Emotion Detection + + +
+ In the domain of human-computer interaction, accurately recognizing and
+ interpreting human emotions is crucial yet challenging due to the complexity
+ and subtlety of emotional expressions. This study explores the potential for
+ detecting a rich and flexible range of emotions through a multimodal approach
+ that integrates facial expressions, voice tones, and transcripts from video
+ clips. We propose a novel framework that maps a variety of emotions into a
+ three-dimensional Valence-Arousal-Dominance (VAD) space, which reflects the
+ fluctuations and positivity/negativity of emotions and enables a more varied
+ and comprehensive representation of emotional states. We employed K-means
+ clustering to transition emotions from traditional discrete categorization to a
+ continuous labeling system and built a classifier for emotion recognition upon
+ this system. The effectiveness of the proposed model is evaluated using the
+ MER2024 dataset, which contains culturally consistent video clips from Chinese
+ movies and TV series, annotated with both discrete and open-vocabulary emotion
+ labels. Our experiments successfully achieved the transformation between
+ discrete and continuous models, and the proposed model generated a more diverse
+ and comprehensive emotion vocabulary while maintaining strong accuracy.
+
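+ As a rough illustration of the discrete-to-continuous step described in the
+ abstract above, the following sketch clusters utterance-level VAD vectors with
+ K-means and treats the cluster assignments as continuous-system labels. The VAD
+ values and emotion names are random placeholders, not MER2024 annotations.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ rng = np.random.default_rng(0)
+ vad = rng.random((200, 3))                    # toy Valence-Arousal-Dominance vectors
+ discrete = rng.choice(["happy", "sad", "angry", "neutral"], size=200)
+
+ kmeans = KMeans(n_clusters=8, n_init=10, random_state=0).fit(vad)
+ cluster_ids = kmeans.labels_                  # continuous-system label per clip
+
+ # Inspect which discrete emotions fall into each VAD cluster.
+ for c in range(kmeans.n_clusters):
+     members, counts = np.unique(discrete[cluster_ids == c], return_counts=True)
+     print(c, np.round(kmeans.cluster_centers_[c], 2), dict(zip(members, counts)))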
+
+
+
+
+ + ☆ MSMF: Multi-Scale Multi-Modal Fusion for Enhanced Stock Market + Prediction + + +
+ This paper presents MSMF (Multi-Scale Multi-Modal Fusion), a novel approach +for enhanced stock market prediction. MSMF addresses key challenges in +multi-modal stock analysis by integrating a modality completion encoder, +multi-scale feature extraction, and an innovative fusion mechanism. Our model +leverages blank learning and progressive fusion to balance complementarity and +redundancy across modalities, while multi-scale alignment facilitates direct +correlations between heterogeneous data types. We introduce Multi-Granularity +Gates and a specialized architecture to optimize the integration of local and +global information for different tasks. Additionally, a Task-targeted +Prediction layer is employed to preserve both coarse and fine-grained features +during fusion. Experimental results demonstrate that MSMF outperforms existing +methods, achieving significant improvements in accuracy and reducing prediction +errors across various stock market forecasting tasks. This research contributes +valuable insights to the field of multi-modal financial analysis and offers a +robust framework for enhanced market prediction. + +
+
+ comment: 15 pages, 1 figures, 7 tables +
+
+
+
+
+ + ☆ Bridging Paintings and Music -- Exploring Emotion based Music Generation + through Paintings + + +
+ Rapid advancements in artificial intelligence have significantly enhanced +generative tasks involving music and images, employing both unimodal and +multimodal approaches. This research develops a model capable of generating +music that resonates with the emotions depicted in visual arts, integrating +emotion labeling, image captioning, and language models to transform visual +inputs into musical compositions. Addressing the scarcity of aligned art and +music data, we curated the Emotion Painting Music Dataset, pairing paintings +with corresponding music for effective training and evaluation. Our dual-stage +framework converts images to text descriptions of emotional content and then +transforms these descriptions into music, facilitating efficient learning with +minimal data. Performance is evaluated using metrics such as Fr\'echet Audio +Distance (FAD), Total Harmonic Distortion (THD), Inception Score (IS), and KL +divergence, with audio-emotion text similarity confirmed by the pre-trained +CLAP model to demonstrate high alignment between generated music and text. This +synthesis tool bridges visual art and music, enhancing accessibility for the +visually impaired and opening avenues in educational and therapeutic +applications by providing enriched multi-sensory experiences. + +
+
+
+
+
+ + ☆ SwinGS: Sliding Window Gaussian Splatting for Volumetric Video Streaming + with Arbitrary Length + + +
+ Recent advances in 3D Gaussian Splatting (3DGS) have garnered significant
+ attention in computer vision and computer graphics due to its high rendering
+ speed and remarkable quality. While extant research has endeavored to extend
+ the application of 3DGS from static to dynamic scenes, such efforts have been
+ consistently impeded by excessive model sizes, constraints on video duration,
+ and content deviation. These limitations significantly compromise the
+ streamability of dynamic 3D Gaussian models, thereby restricting their utility
+ in downstream applications, including volumetric video, autonomous vehicles,
+ and immersive technologies such as virtual, augmented, and mixed reality.
+ This paper introduces SwinGS, a novel framework for training, delivering, and
+ rendering volumetric video in a real-time streaming fashion. To address the
+ aforementioned challenges and enhance streamability, SwinGS integrates
+ spacetime Gaussians with Markov Chain Monte Carlo (MCMC) to adapt the model to
+ fit various 3D scenes across frames, while employing a sliding window that
+ captures Gaussian snapshots for each frame in an accumulative way. We implement
+ a prototype of SwinGS and demonstrate its streamability across various datasets
+ and scenes. Additionally, we develop an interactive WebGL viewer enabling
+ real-time volumetric video playback on most devices with modern browsers,
+ including smartphones and tablets. Experimental results show that SwinGS
+ reduces transmission costs by 83.6% compared to previous work with negligible
+ compromise in PSNR. Moreover, SwinGS easily scales to long video sequences
+ without compromising quality.
+
+
+
+
+
+ + ☆ TMFNet: Two-Stream Multi-Channels Fusion Networks for Color Image + Operation Chain Detection + + +
+ Image operation chain detection techniques have gained increasing attention
+ recently in the field of multimedia forensics. However, existing detection
+ methods suffer from the generalization problem. Moreover, the channel
+ correlation of color images, which provides additional forensic evidence, is
+ often ignored. To solve these issues, in this article we propose a novel
+ two-stream multi-channel fusion network for color image operation chain
+ detection, in which the spatial artifact stream and the noise residual stream
+ are explored in a complementary manner. Specifically, we first propose a novel
+ deep residual architecture without pooling in the spatial artifact stream for
+ learning the global feature representation of multi-channel correlations. Then,
+ a set of filters is designed to aggregate the correlation information across
+ channels while capturing the low-level features in the noise residual stream.
+ Subsequently, the high-level features are extracted by the deep residual model.
+ Finally, features from the two streams are fed into a fusion module to
+ effectively learn richer discriminative representations of the operation chain.
+ Extensive experiments show that the proposed method achieves state-of-the-art
+ generalization ability while maintaining robustness to JPEG compression. The
+ source code used in these experiments will be released at
+ https://github.com/LeiTan-98/TMFNet.
+
+
+ comment: 15 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Learning Video Context as Interleaved Multimodal Sequences ECCV 2024 + + +
+ Narrative videos, such as movies, pose significant challenges in video +understanding due to their rich contexts (characters, dialogues, storylines) +and diverse demands (identify who, relationship, and reason). In this paper, we +introduce MovieSeq, a multimodal language model developed to address the wide +range of challenges in understanding video contexts. Our core idea is to +represent videos as interleaved multimodal sequences (including images, plots, +videos, and subtitles), either by linking external knowledge databases or using +offline models (such as whisper for subtitles). Through instruction-tuning, +this approach empowers the language model to interact with videos using +interleaved multimodal instructions. For example, instead of solely relying on +video as input, we jointly provide character photos alongside their names and +dialogues, allowing the model to associate these elements and generate more +comprehensive responses. To demonstrate its effectiveness, we validate +MovieSeq's performance on six datasets (LVU, MAD, Movienet, CMD, TVC, MovieQA) +across five settings (video classification, audio description, video-text +retrieval, video captioning, and video question-answering). The code will be +public at https://github.com/showlab/MovieSeq. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ An End-to-End Pipeline Perspective on Video Streaming in Best-Effort + Networks: A Survey and Tutorial + + +
+ Remaining a dominant force in Internet traffic, video streaming captivates +end users, service providers, and researchers. This paper takes a pragmatic +approach to reviewing recent advances in the field by focusing on the prevalent +streaming paradigm that involves delivering long-form two-dimensional videos +over the best-effort Internet with client-side adaptive bitrate (ABR) +algorithms and assistance from content delivery networks (CDNs). To enhance +accessibility, we supplement the survey with tutorial material. Unlike existing +surveys that offer fragmented views, our work provides a holistic perspective +on the entire end-to-end streaming pipeline, from video capture by a +camera-equipped device to playback by the end user. Our novel perspective +covers the ingestion, processing, and distribution stages of the pipeline and +addresses key challenges such as video compression, upload, transcoding, ABR +algorithms, CDN support, and quality of experience. We review over 200 papers +and classify streaming designs by their problem-solving methodology, whether +based on intuition (simple heuristics), theory (formal optimization), or +machine learning (generalizable data patterns). The survey further refines +these methodology-based categories and characterizes each design by additional +traits such as compatible codecs and use of super resolution. We connect the +reviewed research to real-world applications by discussing the practices of +commercial streaming platforms. Finally, the survey highlights prominent +current trends and outlines future directions in video streaming. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 31 + +
+
+
+ + ☆ SimulBench: Evaluating Language Models with Creative Simulation Tasks + + +
+ We introduce SimulBench, a benchmark designed to evaluate large language
+ models (LLMs) across a diverse collection of creative simulation scenarios,
+ such as acting as a Linux terminal or playing text games with users. While
+ these simulation tasks serve as effective measures of an LLM's general
+ intelligence, they are seldom incorporated into existing benchmarks. A major
+ challenge is to develop an evaluation framework for testing different LLMs
+ fairly while preserving the multi-round interactive nature of simulation tasks
+ between users and AI. To tackle this issue, we first use a fixed LLM as a
+ user agent to engage with an LLM and collect dialogues under different
+ tasks. Then, challenging dialogue scripts are extracted for evaluating
+ different target LLMs. To facilitate automatic assessment on SimulBench, GPT-4
+ is employed as the evaluator, tasked with reviewing the quality of the final
+ response generated by the target LLMs given multi-turn dialogue scripts. Our
+ comprehensive experiments indicate that these simulation tasks continue to pose
+ a significant challenge due to their unique natures and reveal the gap between
+ proprietary models and the most advanced open LLMs. For example, GPT-4-turbo
+ outperforms LLaMA-3-70b-Chat in 18.55\% more cases.
+
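+ To make the two-stage protocol concrete, the sketch below shows how a fixed
+ user-agent LLM could collect a multi-turn dialogue with a target LLM and how a
+ judge model could then score the final response. The chat_user, chat_target
+ and judge callables are hypothetical stand-ins for real API calls, and the
+ prompt wording is illustrative, not SimulBench's.
+
+ def run_simulation(task_prompt, chat_user, chat_target, max_turns=6):
+     """Collect a multi-turn dialogue for one simulation task."""
+     dialogue = [{"role": "system", "content": task_prompt}]
+     for _ in range(max_turns):
+         user_msg = chat_user(dialogue)        # fixed user-agent LLM
+         dialogue.append({"role": "user", "content": user_msg})
+         reply = chat_target(dialogue)         # target LLM under evaluation
+         dialogue.append({"role": "assistant", "content": reply})
+     return dialogue
+
+ def score_final_response(dialogue, judge):
+     """Ask a judge model (e.g. GPT-4) to rate the last assistant turn."""
+     transcript = "\n".join(f"{m['role']}: {m['content']}" for m in dialogue)
+     return judge("Rate the final assistant reply from 1 to 10:\n" + transcript)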
+
+ comment: Website: https://simulbench.github.io/ +
+
+
+
+
+ + ☆ Can We Count on LLMs? The Fixed-Effect Fallacy and Claims of GPT-4 + Capabilities + + +
+ In this paper we explore evaluation of LLM capabilities. We present +measurements of GPT-4 performance on several deterministic tasks; each task +involves a basic calculation and takes as input parameter some element drawn +from a large well-defined population (e.g., count elements in a list, multiply +two k-digit numbers, etc). We examine several conditions per-task and perform +enough trials so that statistically significant differences can be detected. +This allows us to investigate the sensitivity of task-accuracy both to query +phrasing and input parameter population. We find that seemingly trivial +modifications in the task-prompt or input population can yield differences far +larger than can be explained by sampling effects. For example, performance on a +simple list-counting task varies with query-phrasing and list-length, but also +with list composition (i.e., the thing-to-be-counted) and object frequency +(e.g., success when an element accounts for $\approx$ 50\% of a list is +different from when it accounts for $\approx$ 70\% etc). + We conclude that efforts to quantify LLM capabilities easily succumb to the +language-as-fixed-effect fallacy, where experimental observations are +improperly generalized beyond what the data supports. A consequence appears to +be that intuitions that have been formed based on interactions with humans form +a very unreliable guide as to which input modifications should ``make no +difference'' to LLM performance. + +
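+ The following toy harness illustrates the kind of controlled probe the
+ abstract describes: the target element's frequency and the list composition
+ vary while the true count stays deterministic, so accuracy differences across
+ conditions can be measured. ask_llm is a hypothetical stand-in for a call to
+ the model under test.
+
+ import random
+
+ def make_list(target, filler, length, target_fraction):
+     n_target = round(length * target_fraction)
+     items = [target] * n_target + [filler] * (length - n_target)
+     random.shuffle(items)
+     return items, n_target
+
+ def run_condition(ask_llm, target, filler, length, fraction, trials=200):
+     correct = 0
+     for _ in range(trials):
+         items, truth = make_list(target, filler, length, fraction)
+         prompt = f"How many times does '{target}' appear in this list? {items}"
+         correct += str(truth) in ask_llm(prompt)
+     return correct / trials   # compare across phrasings, lengths and fractions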
+
+
+
+
+ + ☆ Leveraging User-Generated Reviews for Recommender Systems with Dynamic + Headers ECAI + + +
+ E-commerce platforms have a vast catalog of items to cater to their +customers' shopping interests. Most of these platforms assist their customers +in the shopping process by offering optimized recommendation carousels, +designed to help customers quickly locate their desired items. Many models have +been proposed in academic literature to generate and enhance the ranking and +recall set of items in these carousels. Conventionally, the accompanying +carousel title text (header) of these carousels remains static. In most +instances, a generic text such as "Items similar to your current viewing" is +utilized. Fixed variations such as the inclusion of specific attributes "Other +items from a similar seller" or "Items from a similar brand" in addition to +"frequently bought together" or "considered together" are observed as well. +This work proposes a novel approach to customize the header generation process +of these carousels. Our work leverages user-generated reviews that lay focus on +specific attributes (aspects) of an item that were favorably perceived by users +during their interaction with the given item. We extract these aspects from +reviews and train a graph neural network-based model under the framework of a +conditional ranking task. We refer to our innovative methodology as Dynamic +Text Snippets (DTS) which generates multiple header texts for an anchor item +and its recall set. Our approach demonstrates the potential of utilizing +user-generated reviews and presents a unique paradigm for exploring +increasingly context-aware recommendation systems. + +
+
+ comment: 7 pages, 3 figures, PAIS 2024 (ECAI) +
+
+
+
+
+ + ☆ Zero-Shot Machine-Generated Text Detection Using Mixture of Large + Language Models + + +
+ The dissemination of Large Language Models (LLMs), trained at scale and
+ endowed with powerful text-generating abilities, has vastly increased the
+ threats posed by generative AI technologies by reducing the cost of producing
+ harmful, toxic, faked or forged content. In response, various proposals have
+ been made to automatically discriminate artificially generated from
+ human-written texts, typically framing the problem as a classification problem.
+ Most approaches evaluate an input document with a well-chosen detector LLM,
+ assuming that low-perplexity scores reliably signal machine-made content. Since
+ relying on a single detector can make performance brittle, we instead consider
+ several detectors and derive a new, theoretically grounded approach to combine
+ their respective strengths. Our experiments, using a variety of generator LLMs,
+ suggest that our method effectively increases the robustness of detection.
+
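+ A minimal sketch of one way to combine several detector LLMs, assuming each
+ detector exposes a per-document perplexity. The combination rule here
+ (z-normalise each detector's scores, then average) is a generic ensemble
+ baseline for illustration, not the theoretically grounded rule derived in the
+ paper.
+
+ import numpy as np
+
+ def ensemble_scores(perplexities):
+     """perplexities: array of shape (n_detectors, n_documents)."""
+     p = np.asarray(perplexities, dtype=float)
+     z = (p - p.mean(axis=1, keepdims=True)) / p.std(axis=1, keepdims=True)
+     return z.mean(axis=0)          # low combined score -> likely machine text
+
+ def flag_machine_generated(perplexities, threshold=0.0):
+     return ensemble_scores(perplexities) < threshold
+
+ # Toy usage: 3 detectors scoring 4 documents.
+ scores = [[12.1, 55.3, 9.8, 40.2],
+           [10.7, 61.0, 11.2, 38.9],
+           [14.3, 50.8, 10.1, 45.5]]
+ print(flag_machine_generated(scores))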
+
+ comment: Preprint, work in progress +
+
+
+
+
+ + ☆ SUPER: Evaluating Agents on Setting Up and Executing Tasks from Research + Repositories + + +
+ Given that Large Language Models (LLMs) have made significant progress in
+ writing code, can they now be used to autonomously reproduce results from
+ research repositories? Such a capability would be a boon to the research
+ community, helping researchers validate, understand, and extend prior work. To
+ advance towards this goal, we introduce SUPER, the first benchmark designed to
+ evaluate the capability of LLMs in setting up and executing tasks from research
+ repositories. SUPER aims to capture the realistic challenges faced by
+ researchers working with Machine Learning (ML) and Natural Language Processing
+ (NLP) research repositories. Our benchmark comprises three distinct problem
+ sets: 45 end-to-end problems with annotated expert solutions, 152 subproblems
+ derived from the expert set that focus on specific challenges (e.g.,
+ configuring a trainer), and 602 automatically generated problems for
+ larger-scale development. We introduce various evaluation measures to assess
+ both task success and progress, utilizing gold solutions when available or
+ approximations otherwise. We show that state-of-the-art approaches struggle to
+ solve these problems, with the best model (GPT-4o) solving only 16.3% of the
+ end-to-end set and 46.1% of the scenarios. This illustrates the challenge of
+ this task and suggests that SUPER can serve as a valuable resource for the
+ community to make and measure progress.
+
+
+
+
+
+ + ☆ A Suite for Acoustic Language Model Evaluation + + +
+ Speech language models have recently demonstrated great potential as
+ universal speech processing systems. Such models have the ability to model the
+ rich acoustic information existing in audio signals, beyond spoken content,
+ such as emotion, background noise, etc. Despite this, evaluation benchmarks
+ that assess awareness of a wide range of acoustic aspects are lacking. To
+ help bridge this gap, we introduce SALMon, a novel evaluation suite
+ encompassing background noise, emotion, speaker identity and room impulse
+ response. The proposed benchmarks evaluate both the consistency of the
+ inspected element and how well it matches the spoken text. We follow a
+ modelling-based approach, measuring whether a model gives correct samples
+ higher scores than incorrect ones. This approach makes the benchmark fast to
+ compute even for large models. We evaluated several speech language models on
+ SALMon, thus highlighting the strengths and weaknesses of each evaluated
+ method. Code and data are publicly available at
+ https://pages.cs.huji.ac.il/adiyoss-lab/salmon/ .
+
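+ The modelling-based protocol described above reduces to a pairwise comparison,
+ sketched below: for each benchmark pair, the speech LM should assign a higher
+ likelihood score to the consistent sample than to the perturbed one. score_fn
+ is a hypothetical stand-in for the model's log-likelihood of an audio sample.
+
+ def pairwise_accuracy(pairs, score_fn):
+     """pairs: iterable of (correct_sample, incorrect_sample) tuples."""
+     pairs = list(pairs)
+     wins = sum(score_fn(good) > score_fn(bad) for good, bad in pairs)
+     return wins / len(pairs)
+
+ # Toy usage with a dummy scorer that simply prefers longer "audio".
+ dummy_pairs = [("aaaa", "aa"), ("bbbbbb", "b"), ("cc", "cccc")]
+ print(pairwise_accuracy(dummy_pairs, score_fn=len))   # 2/3 here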
+
+
+
+
+ + ☆ Synthetic continued pretraining + + +
+ Pretraining on large-scale, unstructured internet text has enabled language +models to acquire a significant amount of world knowledge. However, this +knowledge acquisition is data-inefficient -- to learn a given fact, models must +be trained on hundreds to thousands of diverse representations of it. This +poses a challenge when adapting a pretrained model to a small corpus of +domain-specific documents, where each fact may appear rarely or only once. We +propose to bridge this gap with synthetic continued pretraining: using the +small domain-specific corpus to synthesize a large corpus more amenable to +learning, and then performing continued pretraining on the synthesized corpus. +We instantiate this proposal with EntiGraph, a synthetic data augmentation +algorithm that extracts salient entities from the source documents and then +generates diverse text by drawing connections between the sampled entities. +Synthetic continued pretraining using EntiGraph enables a language model to +answer questions and follow generic instructions related to the source +documents without access to them. If instead, the source documents are +available at inference time, we show that the knowledge acquired through our +approach compounds with retrieval-augmented generation. To better understand +these results, we build a simple mathematical model of EntiGraph, and show how +synthetic data augmentation can "rearrange" knowledge to enable more +data-efficient learning. + +
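+ A rough sketch of the augmentation loop described above: salient entities are
+ extracted from each source document and a language model is prompted to write
+ new text connecting sampled entity pairs, yielding a larger corpus for
+ continued pretraining. extract_entities and generate are hypothetical
+ stand-ins (e.g. an NER model and an LLM call), and the prompt wording is
+ illustrative rather than the paper's.
+
+ import itertools
+ import random
+
+ def synthesize_corpus(documents, extract_entities, generate, pairs_per_doc=20):
+     synthetic = []
+     for doc in documents:
+         entities = sorted(set(extract_entities(doc)))
+         pairs = list(itertools.combinations(entities, 2))
+         for e1, e2 in random.sample(pairs, k=min(pairs_per_doc, len(pairs))):
+             prompt = (f"Using only facts from the document below, explain how "
+                       f"'{e1}' relates to '{e2}'.\n\n{doc}")
+             synthetic.append(generate(prompt))
+     return synthetic   # corpus used for synthetic continued pretraining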
+
+
+
+
+ + ☆ Agent Workflow Memory + + +
+ Despite the potential of language model-based agents to solve real-world +tasks such as web navigation, current methods still struggle with long-horizon +tasks with complex action trajectories. In contrast, humans can flexibly solve +complex tasks by learning reusable task workflows from past experiences and +using them to guide future actions. To build agents that can similarly benefit +from this process, we introduce Agent Workflow Memory (AWM), a method for +inducing commonly reused routines, i.e., workflows, and selectively providing +workflows to the agent to guide subsequent generations. AWM flexibly applies to +both offline and online scenarios, where agents induce workflows from training +examples beforehand or from test queries on the fly. We experiment on two major +web navigation benchmarks -- Mind2Web and WebArena -- that collectively cover +1000+ tasks from 200+ domains across travel, shopping, and social media, among +others. AWM substantially improves the baseline results by 24.6% and 51.1% +relative success rate on Mind2Web and WebArena while reducing the number of +steps taken to solve WebArena tasks successfully. Furthermore, online AWM +robustly generalizes in cross-task, website, and domain evaluations, surpassing +baselines from 8.9 to 14.0 absolute points as train-test task distribution gaps +widen. + +
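+ As a simplified illustration of workflow induction, the sketch below mines
+ action subsequences that recur across successful trajectories and returns the
+ most frequent ones as reusable "workflows" to prepend to the agent's prompt.
+ AWM itself induces workflows with an LLM; the n-gram counting here is only a
+ stand-in to make the idea concrete.
+
+ from collections import Counter
+
+ def induce_workflows(trajectories, min_len=2, max_len=4, top_k=5):
+     """trajectories: list of action-name lists from solved tasks."""
+     counts = Counter()
+     for actions in trajectories:
+         for n in range(min_len, max_len + 1):
+             for i in range(len(actions) - n + 1):
+                 counts[tuple(actions[i:i + n])] += 1
+     return [list(seq) for seq, c in counts.most_common(top_k) if c > 1]
+
+ demo = [["search", "click_result", "read", "answer"],
+         ["search", "click_result", "read", "compare", "answer"]]
+ print(induce_workflows(demo))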
+
+
+
+
+ + ☆ Towards Fairer Health Recommendations: finding informative unbiased + samples via Word Sense Disambiguation + + +
+ There have been growing concerns around high-stake applications that rely on +models trained with biased data, which consequently produce biased predictions, +often harming the most vulnerable. In particular, biased medical data could +cause health-related applications and recommender systems to create outputs +that jeopardize patient care and widen disparities in health outcomes. A recent +framework titled Fairness via AI posits that, instead of attempting to correct +model biases, researchers must focus on their root causes by using AI to debias +data. Inspired by this framework, we tackle bias detection in medical curricula +using NLP models, including LLMs, and evaluate them on a gold standard dataset +containing 4,105 excerpts annotated by medical experts for bias from a large +corpus. We build on previous work by coauthors which augments the set of +negative samples with non-annotated text containing social identifier terms. +However, some of these terms, especially those related to race and ethnicity, +can carry different meanings (e.g., "white matter of spinal cord"). To address +this issue, we propose the use of Word Sense Disambiguation models to refine +dataset quality by removing irrelevant sentences. We then evaluate fine-tuned +variations of BERT models as well as GPT models with zero- and few-shot +prompting. We found LLMs, considered SOTA on many NLP tasks, unsuitable for +bias detection, while fine-tuned BERT models generally perform well across all +evaluated metrics. + +
+
+ comment: Accepted for long presentation at the FAcctRec @ Recsys 2024 +
+
+
+
+
+ + ☆ Enhancing adversarial robustness in Natural Language Inference using + explanations + + +
+ The surge of state-of-the-art Transformer-based models has undoubtedly pushed +the limits of NLP model performance, excelling in a variety of tasks. We cast +the spotlight on the underexplored task of Natural Language Inference (NLI), +since models trained on popular well-suited datasets are susceptible to +adversarial attacks, allowing subtle input interventions to mislead the model. +In this work, we validate the usage of natural language explanation as a +model-agnostic defence strategy through extensive experimentation: only by +fine-tuning a classifier on the explanation rather than premise-hypothesis +inputs, robustness under various adversarial attacks is achieved in comparison +to explanation-free baselines. Moreover, since there is no standard strategy of +testing the semantic validity of the generated explanations, we research the +correlation of widely used language generation metrics with human perception, +in order for them to serve as a proxy towards robust NLI models. Our approach +is resource-efficient and reproducible without significant computational +limitations. + +
+
+
+
+
+ + ☆ What to align in multimodal contrastive learning? + + +
+ Humans perceive the world through multisensory integration, blending the +information of different modalities to adapt their behavior. Contrastive +learning offers an appealing solution for multimodal self-supervised learning. +Indeed, by considering each modality as a different view of the same entity, it +learns to align features of different modalities in a shared representation +space. However, this approach is intrinsically limited as it only learns shared +or redundant information between modalities, while multimodal interactions can +arise in other ways. In this work, we introduce CoMM, a Contrastive MultiModal +learning strategy that enables the communication between modalities in a single +multimodal space. Instead of imposing cross- or intra- modality constraints, we +propose to align multimodal representations by maximizing the mutual +information between augmented versions of these multimodal features. Our +theoretical analysis shows that shared, synergistic and unique terms of +information naturally emerge from this formulation, allowing us to estimate +multimodal interactions beyond redundancy. We test CoMM both in a controlled +and in a series of real-world settings: in the former, we demonstrate that CoMM +effectively captures redundant, unique and synergistic information between +modalities. In the latter, CoMM learns complex multimodal interactions and +achieves state-of-the-art results on the six multimodal benchmarks. + +
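+ A compact sketch of the objective described above: maximise agreement (an
+ InfoNCE lower bound on mutual information) between two augmented views of the
+ fused multimodal representation, rather than between individual modalities.
+ This is a generic NT-Xent-style loss for illustration, not CoMM's exact
+ architecture or augmentation scheme.
+
+ import torch
+ import torch.nn.functional as F
+
+ def info_nce(z1, z2, temperature=0.1):
+     """z1, z2: (batch, dim) fused features from two augmented multimodal views."""
+     z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
+     logits = z1 @ z2.t() / temperature            # (batch, batch) similarities
+     targets = torch.arange(z1.size(0), device=z1.device)
+     # Symmetric cross-entropy: each view should identify its own counterpart.
+     return 0.5 * (F.cross_entropy(logits, targets) +
+                   F.cross_entropy(logits.t(), targets))
+
+ z_a = torch.randn(32, 256)   # fused view A (e.g. image+text, augmentation 1)
+ z_b = torch.randn(32, 256)   # fused view B (augmentation 2)
+ print(info_nce(z_a, z_b).item())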
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ AdaCAD: Adaptively Decoding to Balance Conflicts between Contextual and + Parametric Knowledge + + +
+ Knowledge conflict arises from discrepancies between information in the +context of a large language model (LLM) and the knowledge stored in its +parameters. This can hurt performance when using standard decoding techniques, +which tend to ignore the context. Existing test-time contrastive methods seek +to address this by comparing the LLM's output distribution with and without the +context and adjust the model according to the contrast between them. However, +we find that these methods frequently misjudge the degree of conflict and +struggle to handle instances that vary in their amount of conflict, with static +methods over-adjusting when conflict is absent. We propose a fine-grained, +instance-level approach called AdaCAD, which dynamically infers the weight of +adjustment based on the degree of conflict, as measured by the Jensen-Shannon +divergence between distributions representing contextual and parametric +knowledge. Our experiments across four models on six diverse question-answering +(QA) datasets and three summarization tasks demonstrate that our training-free +adaptive method consistently outperforms other decoding methods on QA, with +average accuracy gains of 14.21% (absolute) over a static contrastive baseline, +and improves the factuality of summaries by 5.59 (AlignScore). Furthermore, our +analysis shows that while decoding with contrastive baselines hurts performance +when conflict is absent, AdaCAD mitigates these losses, making it more +applicable to real-world datasets in which some examples have conflict and +others do not. + +
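+ The sketch below shows one step of the adaptive adjustment described above,
+ assuming the usual contrastive-decoding form: the with-context distribution is
+ boosted against the context-free (parametric) one, and the adjustment weight
+ is set per step to the Jensen-Shannon divergence between the two
+ distributions. Details may differ from the paper's exact recipe.
+
+ import numpy as np
+ from scipy.spatial.distance import jensenshannon
+
+ def adacad_step(logits_ctx, logits_prior):
+     p_ctx = np.exp(logits_ctx - logits_ctx.max());    p_ctx /= p_ctx.sum()
+     p_pri = np.exp(logits_prior - logits_prior.max()); p_pri /= p_pri.sum()
+     alpha = jensenshannon(p_ctx, p_pri, base=2) ** 2   # JSD, bounded in [0, 1]
+     adjusted = (1 + alpha) * logits_ctx - alpha * logits_prior
+     adjusted = np.exp(adjusted - adjusted.max())
+     return adjusted / adjusted.sum(), alpha            # next-token probs, weight
+
+ probs, alpha = adacad_step(np.random.randn(50), np.random.randn(50))
+ print(round(alpha, 3), probs.argmax())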
+
+ comment: 16 pages, Code: https://github.com/HanNight/AdaCAD +
+
+
+
+
+ + ☆ Recent Trends of Multimodal Affective Computing: A Survey from NLP + Perspective + + +
+ Multimodal affective computing (MAC) has garnered increasing attention due to
+ its broad applications in analyzing human behaviors and intentions, especially
+ in the text-dominated multimodal affective computing field. This survey
+ presents the recent trends of multimodal affective computing from an NLP
+ perspective through four hot tasks: multimodal sentiment analysis, multimodal
+ emotion recognition in conversation, multimodal aspect-based sentiment analysis
+ and multimodal multi-label emotion recognition. The goal of this survey is to
+ explore the current landscape of multimodal affective research, identify
+ development trends, and highlight the similarities and differences across
+ various tasks, offering a comprehensive report on the recent progress in
+ multimodal affective computing from an NLP perspective. This survey covers the
+ formalization of tasks, provides an overview of relevant works, describes
+ benchmark datasets, and details the evaluation metrics for each task. It also
+ briefly discusses research in multimodal affective computing involving facial
+ expressions, acoustic signals, physiological signals, and emotion causes.
+ Finally, we discuss the technical approaches, challenges, and future directions
+ in multimodal affective computing. To support further research, we have
+ released a repository that compiles related works in multimodal affective
+ computing, providing detailed resources and references for the community.
+
+
+
+
+
+ + ☆ Awaking the Slides: A Tuning-free and Knowledge-regulated AI Tutoring + System via Language Model Coordination + + +
+ Vast numbers of pre-existing slides serve as rich and important materials that
+ carry lecture knowledge. However, effectively leveraging lecture slides to
+ serve students is difficult due to the multi-modal nature of slide content and
+ the heterogeneous teaching actions. We study the problem of discovering
+ effective designs that convert a slide into an interactive lecture. We develop
+ Slide2Lecture, a tuning-free and knowledge-regulated intelligent tutoring
+ system that can (1) effectively convert an input lecture slide into a
+ structured teaching agenda consisting of a set of heterogeneous teaching
+ actions; (2) create and manage an interactive lecture that generates responsive
+ interactions catering to student learning demands while regulating the
+ interactions to follow teaching actions. Slide2Lecture contains a complete
+ pipeline for learners to obtain an interactive classroom experience to learn
+ the slide. For teachers and developers, Slide2Lecture enables customization to
+ cater to personalized demands. The evaluation by annotators and students shows
+ that Slide2Lecture is effective, outperforming the other implementations
+ evaluated. Slide2Lecture's online deployment has produced more than 200K
+ interactions with students across 3K lecture sessions. We open-source
+ Slide2Lecture's implementation at
+ https://anonymous.4open.science/r/slide2lecture-4210/.
+
+
+
+
+
+ + ☆ Think Together and Work Better: Combining Humans' and LLMs' Think-Aloud + Outcomes for Effective Text Evaluation + + +
+ This study introduces \textbf{InteractEval}, a framework that integrates
+ human expertise and Large Language Models (LLMs) using the Think-Aloud (TA)
+ method to generate attributes for checklist-based text evaluation. By combining
+ human flexibility and reasoning with LLM consistency, InteractEval outperforms
+ traditional non-LLM-based and LLM-based baselines across four distinct
+ dimensions: Coherence, Fluency, Consistency, and Relevance. The experiments
+ also investigate the effectiveness of the TA method, showing that it promotes
+ divergent thinking in both humans and LLMs, leading to the generation of a
+ wider range of relevant attributes and enhancing text evaluation performance.
+ Comparative analysis reveals that humans excel at identifying attributes
+ related to internal quality (Coherence and Fluency), whereas LLMs perform
+ better at attributes related to external alignment (Consistency and Relevance).
+ Consequently, leveraging humans and LLMs together produces the best evaluation
+ outcomes. In other words, this study emphasizes the necessity of effectively
+ combining humans and LLMs in an automated checklist-based text evaluation
+ framework. The code is available at
+ \textbf{\url{https://github.com/BBeeChu/InteractEval.git}}.
+
+
+
+
+
+ + ☆ Explanation, Debate, Align: A Weak-to-Strong Framework for Language + Model Generalization + + +
+ The rapid advancement of artificial intelligence systems has brought the +challenge of AI alignment to the forefront of research, particularly in complex +decision-making and task execution. As these systems surpass human-level +performance in sophisticated problems, ensuring their alignment with human +values, intentions, and ethical guidelines becomes crucial. Building on +previous work in explanation generation for human-agent alignment, we address +the more complex dynamics of multi-agent systems and human-AI teams. This paper +introduces a novel approach to model alignment through weak-to-strong +generalization in the context of language models. We present a framework where +a strong model facilitates the improvement of a weaker model, bridging the gap +between explanation generation and model alignment. Our method, formalized as a +facilitation function, allows for the transfer of capabilities from advanced +models to less capable ones without direct access to extensive training data. +Our results suggest that this facilitation-based approach not only enhances +model performance but also provides insights into the nature of model alignment +and the potential for scalable oversight of AI systems. + +
+
+
+
+
+ + ☆ MEDIC: Towards a Comprehensive Framework for Evaluating LLMs in Clinical + Applications + + +
+ The rapid development of Large Language Models (LLMs) for healthcare +applications has spurred calls for holistic evaluation beyond frequently-cited +benchmarks like USMLE, to better reflect real-world performance. While +real-world assessments are valuable indicators of utility, they often lag +behind the pace of LLM evolution, likely rendering findings obsolete upon +deployment. This temporal disconnect necessitates a comprehensive upfront +evaluation that can guide model selection for specific clinical applications. +We introduce MEDIC, a framework assessing LLMs across five critical dimensions +of clinical competence: medical reasoning, ethics and bias, data and language +understanding, in-context learning, and clinical safety. MEDIC features a novel +cross-examination framework quantifying LLM performance across areas like +coverage and hallucination detection, without requiring reference outputs. We +apply MEDIC to evaluate LLMs on medical question-answering, safety, +summarization, note generation, and other tasks. Our results show performance +disparities across model sizes, baseline vs medically finetuned models, and +have implications on model selection for applications requiring specific model +strengths, such as low hallucination or lower cost of inference. MEDIC's +multifaceted evaluation reveals these performance trade-offs, bridging the gap +between theoretical capabilities and practical implementation in healthcare +settings, ensuring that the most promising models are identified and adapted +for diverse healthcare applications. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Using Generative Agents to Create Tip Sheets for Investigative Data + Reporting + + +
+ This paper introduces a system using generative AI agents to create tip +sheets for investigative data reporting. Our system employs three specialized +agents--an analyst, a reporter, and an editor--to collaboratively generate and +refine tips from datasets. We validate this approach using real-world +investigative stories, demonstrating that our agent-based system generally +generates more newsworthy and accurate insights compared to a baseline model +without agents, although some variability was noted between different stories. +Our findings highlight the potential of generative AI to provide leads for +investigative data reporting. + +
+
+ comment: Short paper to be presented at Computation + Journalism 2024 +
+
+
+
+
+ + ☆ Cross-Dialect Text-To-Speech in Pitch-Accent Language Incorporating + Multi-Dialect Phoneme-Level BERT + + +
+ We explore cross-dialect text-to-speech (CD-TTS), a task to synthesize +learned speakers' voices in non-native dialects, especially in pitch-accent +languages. CD-TTS is important for developing voice agents that naturally +communicate with people across regions. We present a novel TTS model comprising +three sub-modules to perform competitively at this task. We first train a +backbone TTS model to synthesize dialect speech from a text conditioned on +phoneme-level accent latent variables (ALVs) extracted from speech by a +reference encoder. Then, we train an ALV predictor to predict ALVs tailored to +a target dialect from input text leveraging our novel multi-dialect +phoneme-level BERT. We conduct multi-dialect TTS experiments and evaluate the +effectiveness of our model by comparing it with a baseline derived from +conventional dialect TTS methods. The results show that our model improves the +dialectal naturalness of synthetic speech in CD-TTS. + +
+
+ comment: Accepted by IEEE SLT 2024 +
+
+
+
+
+ + ☆ Propaganda to Hate: A Multimodal Analysis of Arabic Memes with + Multi-Agent LLMs + + +
+ In the past decade, social media platforms have been used for information +dissemination and consumption. While a major portion of the content is posted +to promote citizen journalism and public awareness, some content is posted to +mislead users. Among different content types such as text, images, and videos, +memes (text overlaid on images) are particularly prevalent and can serve as +powerful vehicles for propaganda, hate, and humor. In the current literature, +there have been efforts to individually detect such content in memes. However, +the study of their intersection is very limited. In this study, we explore the +intersection between propaganda and hate in memes using a multi-agent LLM-based +approach. We extend the propagandistic meme dataset with coarse and +fine-grained hate labels. Our finding suggests that there is an association +between propaganda and hate in memes. We provide detailed experimental results +that can serve as a baseline for future studies. We will make the experimental +resources publicly available to the community. + +
+
+ comment: propaganda, hate-speech, disinformation, misinformation, fake news, + LLMs, GPT-4, multimodality, multimodal LLMs +
+
+
+
+
+ + ♻ ☆ Continual Skill and Task Learning via Dialogue + + +
+ Continual and interactive robot learning is a challenging problem as the
+ robot is presented with human users who expect it to learn novel skills to
+ solve novel tasks perpetually with sample efficiency. In this work we present a
+ framework for robots to query and learn visuo-motor robot skills and task
+ relevant information via natural language dialog interactions with human users.
+ Previous approaches either focus on improving the performance of instruction
+ following agents, or passively learn novel skills or concepts. Instead, we used
+ dialog combined with a language-skill grounding embedding to query or confirm
+ skills and/or tasks requested by a user. To achieve this goal, we developed and
+ integrated three different components for our agent. Firstly, we propose a
+ novel visual-motor control policy, ACT with Low Rank Adaptation (ACT-LoRA),
+ which enables the existing SoTA ACT model to perform few-shot continual
+ learning. Secondly, we develop an alignment model that projects demonstrations
+ across skill embodiments into a shared embedding, allowing us to know when to
+ ask questions and/or request demonstrations from users. Finally, we integrated
+ an existing LLM to interact with a human user to perform grounded interactive
+ continual skill learning to solve a task. Our ACT-LoRA model learns novel
+ fine-tuned skills with 100% accuracy when trained with only five
+ demonstrations for a novel skill, while still maintaining 74.75% accuracy on
+ pre-trained skills in the RLBench dataset, where other models fall
+ significantly short. We also performed a human-subjects study with 8 subjects
+ to demonstrate the continual learning capabilities of our combined framework.
+ We achieve a success rate of 75% in the task of sandwich making with the real
+ robot learning from participant data, demonstrating that robots can learn novel
+ skills or task knowledge from dialogue with non-expert users using our
+ approach.
+
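+ As a minimal sketch of the Low-Rank Adaptation idea referenced above, the
+ layer below freezes a pretrained linear weight and trains only a low-rank
+ update, which is the mechanism that lets an ACT-LoRA-style policy add new
+ skills from a few demonstrations. This is a generic LoRA linear layer, not the
+ ACT policy itself.
+
+ import torch
+ import torch.nn as nn
+
+ class LoRALinear(nn.Module):
+     def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
+         super().__init__()
+         self.base = base
+         self.base.weight.requires_grad_(False)      # freeze pretrained weights
+         self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+         self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))
+         self.scale = alpha / rank
+
+     def forward(self, x):
+         low_rank = x @ self.lora_a.t() @ self.lora_b.t()   # trainable update
+         return self.base(x) + self.scale * low_rank
+
+ layer = LoRALinear(nn.Linear(512, 512))
+ print(layer(torch.randn(4, 512)).shape)                    # torch.Size([4, 512])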
+
+
+
+
+ + ♻ ☆ Can ChatGPT Forecast Stock Price Movements? Return Predictability and + Large Language Models + + +
+ We document the capability of large language models (LLMs) like ChatGPT to +predict stock price movements using news headlines, even without direct +financial training. ChatGPT scores significantly predict out-of-sample daily +stock returns, subsuming traditional methods, and predictability is stronger +among smaller stocks and following negative news. To explain these findings, we +develop a theoretical model incorporating information capacity constraints, +underreaction, limits-to-arbitrage, and LLMs. The model generates several key +predictions, which we empirically test: (i) it establishes a critical threshold +in AI capabilities necessary for profitable predictions, (ii) it demonstrates +that only advanced LLMs can effectively interpret complex information, and +(iii) it predicts that widespread LLM adoption can enhance market efficiency. +Our results suggest that sophisticated return forecasting is an emerging +capability of AI systems and that these technologies can alter information +diffusion and decision-making processes in financial markets. Finally, we +introduce an interpretability framework to evaluate LLMs' reasoning, +contributing to AI transparency and economic decision-making. + +
+
+ comment: Previously posted in SSRN + https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4412788 +
+
+
+
+
+ + ♻ ☆ Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate + Scheduler + + +
+ Finding the optimal learning rate for language model pretraining is a
+ challenging task. This is not only because there is a complicated correlation
+ between learning rate, batch size, number of training tokens, model size, and
+ other hyperparameters, but also because it is prohibitively expensive to
+ perform a hyperparameter search for large language models with billions or
+ trillions of parameters. Recent studies propose using small proxy models and
+ small corpora to perform hyperparameter searches and transferring the optimal
+ parameters to large models and large corpora. While the zero-shot
+ transferability is theoretically and empirically proven for model-size-related
+ hyperparameters, like depth and width, the zero-shot transfer from small to
+ large corpora is underexplored. In this paper, we study the correlation between
+ optimal learning rate, batch size, and number of training tokens for the
+ recently proposed WSD scheduler. After thousands of small experiments, we found
+ a power-law relationship between these variables and demonstrated its
+ transferability across model sizes. Based on this observation, we propose a new
+ learning rate scheduler, the Power scheduler, that is agnostic about the number
+ of training tokens and batch size. The experiments show that combining the
+ Power scheduler with Maximum Update Parameterization (muP) can consistently
+ achieve impressive performance with one set of hyperparameters regardless of
+ the number of training tokens, batch size, model size, and even model
+ architecture. Our 3B dense and MoE models trained with the Power scheduler
+ achieve comparable performance as state-of-the-art small language models. We
+ open-source these pretrained models at https://ibm.biz/BdKhLa.
+
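+ For illustration only, here is a generic power-law learning-rate decay with
+ linear warmup, to make the idea of a schedule that does not depend on the
+ total token budget concrete. The constants and the exact functional form of
+ the Power scheduler in the paper differ; treat peak_lr, the exponent and the
+ warmup length below as placeholders.
+
+ def power_lr(step, warmup_steps=1000, peak_lr=3e-3, exponent=0.5):
+     if step < warmup_steps:
+         return peak_lr * (step + 1) / warmup_steps              # linear warmup
+     return peak_lr * ((step + 1) / warmup_steps) ** -exponent   # power-law decay
+
+ for s in (0, 500, 1_000, 10_000, 100_000):
+     print(s, f"{power_lr(s):.2e}")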
+
+
+
+
+ + ♻ ☆ LongGenbench: Benchmarking Long-Form Generation in Long Context LLMs + + +
+ The abilities of long-context language models (LMs) are often evaluated using +the "Needle-in-a-Haystack" (NIAH) test, which comprises tasks designed to +assess a model's ability to identify specific information ("needle") within +large text sequences ("haystack"). While these benchmarks measure how well +models understand long-context input sequences, they do not effectively gauge +the quality of long-form text generation--a critical aspect for applications +such as design proposals and creative writing. To address this gap, we have +introduced a new long-form text evaluation benchmark, LongGenbench, which tests +models' ability to identify specific events within generated long text +sequences. In this benchmark, we prompt long-context LMs to create long-form +text that must include particular events or constraints and evaluate their +ability to incorporate these elements. We evaluated ten long-context LMs across +four distinct scenarios, three types of prompt instructions, and two different +generation-length settings (16K and 32K). Although these models perform well on +NIAH benchmarks, none demonstrated satisfactory performance on the +LongGenbench, raising concerns about their ability to generate coherent +long-form text that follows instructions. Additionally, as the length of the +generated text increases, all models exhibit a significant drop in performance. + +
+
+ comment: work in progress. arXiv admin note: text overlap with + arXiv:2404.06654 by other authors +
+
+
+
+
+ + ♻ ☆ Explaining Text Classifiers with Counterfactual Representations ECAI 2024 + + +
+ One well-motivated explanation method for classifiers leverages
+ counterfactuals, which are hypothetical events identical to real observations
+ in all aspects except for one feature. Constructing such counterfactuals poses
+ specific challenges for text, however, as some attribute values may not
+ necessarily align with plausible real-world events. In this paper we propose a
+ simple method for generating counterfactuals by intervening in the space of
+ text representations, which bypasses this limitation. We argue that our
+ interventions are minimally disruptive and that they are theoretically sound as
+ they align with counterfactuals as defined in Pearl's causal inference
+ framework. To validate our method, we conducted experiments first on a
+ synthetic dataset and then on a realistic dataset of counterfactuals. This
+ allows for a direct comparison between classifier predictions based on ground
+ truth counterfactuals - obtained through explicit text interventions - and our
+ counterfactuals, derived through interventions in the representation space.
+ Finally, we study a real-world scenario where our counterfactuals can be
+ leveraged both for explaining a classifier and for bias mitigation.
+
+
+ comment: 24 pages, 4 figures, accepted for publication in ECAI 2024 +
+
+
+
+
+ + ♻ ☆ Learning to Generate Instruction Tuning Datasets for Zero-Shot Task + Adaptation ACL + + +
+ We introduce Bonito, an open-source model for conditional task generation +that converts unannotated text into task-specific training datasets for +instruction tuning. We aim to enable zero-shot task adaptation of large +language models on users' specialized, private data. We train Bonito by +fine-tuning a pretrained large language model on a new large-scale dataset with +1.65M examples created by remixing existing instruction tuning datasets into +meta-templates. The meta-templates for a dataset produce training examples +where the input is the unannotated text and the task attribute and the output +consists of the instruction and the response. We use Bonito to generate +synthetic tasks for seven datasets from specialized domains with unannotated +text across three task types -- yes-no question answering, extractive question +answering, and natural language inference -- and adapt language models. We show +that Bonito significantly improves the average performance of pretrained and +instruction tuned models over the de facto self supervised baseline. For +example, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral +and Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1 +points whereas the next word prediction objective undoes some of the benefits +of instruction tuning and reduces the average performance by 0.8 F1 points. We +conduct additional experiments with Bonito to understand the effects of the +domain, the size of the training set, and the choice of alternative synthetic +task generators. Overall, we show that learning with synthetic instruction +tuning datasets is an effective way to adapt language models to new domains. +The model, dataset, and code are available at +https://github.com/BatsResearch/bonito. + +
+
+ comment: ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ CriticEval: Evaluating Large Language Model as Critic + + +
+ Critique ability, i.e., the capability of Large Language Models (LLMs) to +identify and rectify flaws in responses, is crucial for their applications in +self-improvement and scalable oversight. While numerous studies have been +proposed to evaluate critique ability of LLMs, their comprehensiveness and +reliability are still limited. To overcome this problem, we introduce +CriticEval, a novel benchmark designed to comprehensively and reliably evaluate +critique ability of LLMs. Specifically, to ensure the comprehensiveness, +CriticEval evaluates critique ability from four dimensions across nine diverse +task scenarios. It evaluates both scalar-valued and textual critiques, +targeting responses of varying quality. To ensure the reliability, a large +number of critiques are annotated to serve as references, enabling GPT-4 to +evaluate textual critiques reliably. Extensive evaluations of open-source and +closed-source LLMs first validate the reliability of evaluation in CriticEval. +Then, experimental results demonstrate the promising potential of open-source +LLMs, the effectiveness of critique datasets and several intriguing +relationships between the critique ability and some critical factors, including +task types, response qualities and critique dimensions. Datasets and evaluation +toolkit for CriticEval will be publicly released. + +
+
+
+
+
+ + ♻ ☆ Explainable Identification of Hate Speech towards Islam using Graph + Neural Networks NeurIPS 2023 + + +
+ Islamophobic language on online platforms fosters intolerance, making +detection and elimination crucial for promoting harmony. Traditional hate +speech detection models rely on NLP techniques like tokenization, +part-of-speech tagging, and encoder-decoder models. However, Graph Neural +Networks (GNNs), with their ability to utilize relationships between data +points, offer more effective detection and greater explainability. In this +work, we represent speeches as nodes and connect them with edges based on their +context and similarity to develop the graph. This study introduces a novel +paradigm using GNNs to identify and explain hate speech towards Islam. Our +model leverages GNNs to understand the context and patterns of hate speech by +connecting texts via pretrained NLP-generated word embeddings, achieving +state-of-the-art performance and enhancing detection accuracy while providing +valuable explanations. This highlights the potential of GNNs in combating +online hate speech and fostering a safer, more inclusive online environment. + +
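+ The graph-construction step described above can be sketched very simply: each
+ text is a node and edges connect pairs whose pretrained embeddings are
+ sufficiently similar, after which the graph would be fed to a GNN classifier.
+ The random embeddings and the threshold below are placeholders for
+ illustration, not the paper's configuration.
+
+ import numpy as np
+
+ def build_similarity_graph(embeddings, threshold=0.8):
+     x = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+     sim = x @ x.T                                   # cosine similarity matrix
+     return [(i, j) for i in range(len(x)) for j in range(i + 1, len(x))
+             if sim[i, j] >= threshold]
+
+ emb = np.random.randn(12, 384)                      # placeholder text embeddings
+ print(build_similarity_graph(emb, threshold=0.2))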
+
+ comment: Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (non-archival) + (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP + 2024 : NLP for Positive Impact Workshop (archival) +
+
+
+
+
+ + ♻ ☆ AEGIS: Online Adaptive AI Content Safety Moderation with Ensemble of LLM + Experts + + +
+ As Large Language Models (LLMs) and generative AI become more widespread, the
+ content safety risks associated with their use also increase. We find a notable
+ deficiency in high-quality content safety datasets and benchmarks that
+ comprehensively cover a wide range of critical safety areas. To address this,
+ we define a broad content safety risk taxonomy, comprising 13 critical risk and
+ 9 sparse risk categories. Additionally, we curate AEGISSAFETYDATASET, a new
+ dataset of approximately 26,000 human-LLM interaction instances, complete with
+ human annotations adhering to the taxonomy. We plan to release this dataset to
+ the community to further research and to help benchmark LLM models for safety.
+ To demonstrate the effectiveness of the dataset, we instruction-tune multiple
+ LLM-based safety models. We show that our models (named AEGISSAFETYEXPERTS)
+ not only surpass or perform competitively with the state-of-the-art LLM-based
+ safety models and general-purpose LLMs, but also exhibit robustness across
+ multiple jailbreak attack categories. We also show how using
+ AEGISSAFETYDATASET during the LLM alignment phase does not negatively impact
+ the performance of the aligned models on MT Bench scores. Furthermore, we
+ propose AEGIS, a novel application of a no-regret online adaptation framework
+ with strong theoretical guarantees, to perform content moderation with an
+ ensemble of LLM content safety experts in deployment.
+
+
+
+
+
+ + ♻ ☆ Leveraging Cross-Lingual Transfer Learning in Spoken Named Entity + Recognition Systems + + +
+ Recent Named Entity Recognition (NER) advancements have significantly +enhanced text classification capabilities. This paper focuses on spoken NER, +aimed explicitly at spoken document retrieval, an area not widely studied due +to the lack of comprehensive datasets for spoken contexts. Additionally, the +potential for cross-lingual transfer learning in low-resource situations +deserves further investigation. In our study, we applied transfer learning +techniques across Dutch, English, and German using both pipeline and End-to-End +(E2E) approaches. We employed Wav2Vec2 XLS-R models on custom pseudo-annotated +datasets to evaluate the adaptability of cross-lingual systems. Our exploration +of different architectural configurations assessed the robustness of these +systems in spoken NER. Results showed that the E2E model was superior to the +pipeline model, particularly with limited annotation resources. Furthermore, +transfer learning from German to Dutch improved performance by 7% over the +standalone Dutch E2E system and 4% over the Dutch pipeline model. Our findings +highlight the effectiveness of cross-lingual transfer in spoken NER and +emphasize the need for additional data collection to improve these systems. + +
+
+
+
+
+ + ♻ ☆ CLIBE: Detecting Dynamic Backdoors in Transformer-based NLP Models NDSS + + +
+ Backdoors can be injected into NLP models to induce misbehavior when the +input text contains a specific feature, known as a trigger, which the attacker +secretly selects. Unlike fixed words, phrases, or sentences used in the static +text trigger, NLP dynamic backdoor attacks design triggers associated with +abstract and latent text features, making them considerably stealthier than +traditional static backdoor attacks. However, existing research on NLP backdoor +detection primarily focuses on defending against static backdoor attacks, while +detecting dynamic backdoors in NLP models remains largely unexplored. This +paper presents CLIBE, the first framework to detect dynamic backdoors in +Transformer-based NLP models. CLIBE injects a "few-shot perturbation" into the +suspect Transformer model by crafting optimized weight perturbation in the +attention layers to make the perturbed model classify a limited number of +reference samples as a target label. Subsequently, CLIBE leverages the +generalization ability of this few-shot perturbation to determine whether the +original model contains a dynamic backdoor. Extensive evaluation on three +advanced NLP dynamic backdoor attacks, two widely-used Transformer frameworks, +and four real-world classification tasks strongly validates the effectiveness +of CLIBE. We also demonstrate the robustness of CLIBE against various adaptive +attacks. Furthermore, we employ CLIBE to scrutinize 49 popular Transformer +models on Hugging Face and discover one exhibiting a high probability of +containing a dynamic backdoor. We have contacted Hugging Face and provided +detailed evidence of this model's backdoor behavior. Moreover, we extend CLIBE +to detect backdoor text generation models modified to exhibit toxic behavior. +To the best of our knowledge, CLIBE is the first framework capable of detecting +backdoors in text generation models without access to trigger input test +samples. + +
+
+ comment: To appear in the Network and Distributed System Security (NDSS) + Symposium, February, 2025 +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Leveraging User-Generated Reviews for Recommender Systems with Dynamic + Headers ECAI + + +
+ E-commerce platforms have a vast catalog of items to cater to their +customers' shopping interests. Most of these platforms assist their customers +in the shopping process by offering optimized recommendation carousels, +designed to help customers quickly locate their desired items. Many models have +been proposed in academic literature to generate and enhance the ranking and +recall set of items in these carousels. Conventionally, the accompanying +carousel title text (header) of these carousels remains static. In most +instances, a generic text such as "Items similar to your current viewing" is +utilized. Fixed variations such as the inclusion of specific attributes "Other +items from a similar seller" or "Items from a similar brand" in addition to +"frequently bought together" or "considered together" are observed as well. +This work proposes a novel approach to customize the header generation process +of these carousels. Our work leverages user-generated reviews that lay focus on +specific attributes (aspects) of an item that were favorably perceived by users +during their interaction with the given item. We extract these aspects from +reviews and train a graph neural network-based model under the framework of a +conditional ranking task. We refer to our innovative methodology as Dynamic +Text Snippets (DTS) which generates multiple header texts for an anchor item +and its recall set. Our approach demonstrates the potential of utilizing +user-generated reviews and presents a unique paradigm for exploring +increasingly context-aware recommendation systems. + +
+
+ comment: 7 pages, 3 figures, PAIS 2024 (ECAI) +
+
+
+
+
+ + ☆ Multilingual Prompts in LLM-Based Recommenders: Performance Across + Languages + + +
+ Large language models (LLMs) are increasingly used in natural language
+processing tasks. Recommender systems traditionally use methods such as
+collaborative filtering and matrix factorization, as well as advanced
+techniques like deep learning and reinforcement learning. Although language
+models have been applied in recommendation, the recent trend has focused on
+leveraging the generative capabilities of LLMs for more personalized
+suggestions. While current research focuses on English due to its resource
+richness, this work explores the impact of non-English prompts on
+recommendation performance. Using OpenP5, a platform for developing and
+evaluating LLM-based recommendations, we expanded its English prompt templates
+to include Spanish and Turkish. Evaluation on three real-world datasets, namely
+ML1M, LastFM, and Amazon-Beauty, showed that usage of non-English prompts
+generally reduces performance, especially in less-resourced languages like
+Turkish. We also retrained an LLM-based recommender model with multilingual
+prompts to analyze performance variations. Retraining with multilingual prompts
+resulted in more balanced performance across languages, but slightly reduced
+English performance. This work highlights the need for diverse language support
+in LLM-based recommenders and suggests future research on creating evaluation
+datasets, using newer models and additional languages.
+
+
+
+
+
+ + ☆ Dot Product is All You Need: Bridging the Gap Between Item + Recommendation and Link Prediction + + +
+ Item recommendation (the task of predicting if a user may interact with new +items from the catalogue in a recommendation system) and link prediction (the +task of identifying missing links in a knowledge graph) have long been regarded +as distinct problems. In this work, we show that the item recommendation +problem can be seen as an instance of the link prediction problem, where +entities in the graph represent users and items, and the task consists of +predicting missing instances of the relation type <>. In a +preliminary attempt to demonstrate the assumption, we decide to test three +popular factorisation-based link prediction models on the item recommendation +task, showing that their predictive accuracy is competitive with ten +state-of-the-art recommendation models. The purpose is to show how the former +may be seamlessly and effectively applied to the recommendation task without +any specific modification to their architectures. Finally, while beginning to +unveil the key reasons behind the recommendation performance of the selected +link prediction models, we explore different settings for their hyper-parameter +values, paving the way for future directions. + +
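+ The reduction sketched above can be made concrete with a DistMult-style
+factorisation: users and items are embedded as entities, the single
+"interacts-with" relation gets its own vector, and with that vector fixed to
+ones the triple score collapses to a plain dot product. Dimensions are
+illustrative.
+
+import torch
+
+n_users, n_items, d = 1000, 5000, 64
+user_emb = torch.nn.Embedding(n_users, d)
+item_emb = torch.nn.Embedding(n_items, d)
+rel_emb = torch.nn.Parameter(torch.ones(d))     # one relation type: "interacts with"
+
+def score(user_ids, item_ids):
+    # DistMult triple score <u, r, i>; with r = 1 this is exactly the dot product.
+    return (user_emb(user_ids) * rel_emb * item_emb(item_ids)).sum(-1)
+
+# Top-10 recommendations for user 42, treated as link prediction over all items.
+top_items = score(torch.tensor([42]).repeat(n_items), torch.arange(n_items)).topk(10).indices
+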
+
+
+
+
+ + ☆ Hierarchical Reinforcement Learning for Temporal Abstraction of Listwise + Recommendation + + +
+ Modern listwise recommendation systems need to consider both long-term user
+perceptions and short-term interest shifts. Reinforcement learning can be
+applied to recommendation to study such a problem but is also subject to a
+large search space, sparse user feedback, and long interaction latency.
+Motivated by recent progress in hierarchical reinforcement learning, we propose
+a novel framework called mccHRL to provide different levels of temporal
+abstraction on listwise recommendation. Within the hierarchical framework, the
+high-level agent studies the evolution of user perception, while the low-level
+agent produces the item selection policy by modeling the process as a
+sequential decision-making problem. We argue that such a framework has a
+well-defined decomposition of the inter-session context and the intra-session
+context, which are encoded by the high-level and low-level agents,
+respectively. To verify this argument, we implement both a simulator-based
+environment and an industrial dataset-based experiment. Results show a
+significant performance improvement for our method, compared with several
+well-known baselines. Data and code have been made public.
+
+
+ comment: 18 pages, 4 figures +
+
+
+
+
+ + ☆ Enhancing Sequential Music Recommendation with Negative + Feedback-informed Contrastive Learning + + +
+ Modern music streaming services are heavily based on recommendation engines +to serve content to users. Sequential recommendation -- continuously providing +new items within a single session in a contextually coherent manner -- has been +an emerging topic in current literature. User feedback -- a positive or +negative response to the item presented -- is used to drive content +recommendations by learning user preferences. We extend this idea to +session-based recommendation to provide context-coherent music recommendations +by modelling negative user feedback, i.e., skips, in the loss function. We +propose a sequence-aware contrastive sub-task to structure item embeddings in +session-based music recommendation, such that true next-positive items +(ignoring skipped items) are structured closer in the session embedding space, +while skipped tracks are structured farther away from all items in the session. +This directly affects item rankings using a K-nearest-neighbors search for +next-item recommendations, while also promoting the rank of the true next item. +Experiments incorporating this task into SoTA methods for sequential item +recommendation show consistent performance gains in terms of next-item hit +rate, item ranking, and skip down-ranking on three music recommendation +datasets, strongly benefiting from the increasing presence of user feedback. + +
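+ One way to realise such a negative-feedback-informed contrastive sub-task is
+an InfoNCE-style loss where the true next (non-skipped) track is the positive
+and skipped tracks act as explicit negatives; embedding shapes and the
+temperature are assumptions, not the authors' exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def skip_aware_contrastive(session_emb, next_pos_emb, skipped_embs, tau=0.1):
+    """session_emb, next_pos_emb: (d,); skipped_embs: (n_skip, d)."""
+    s = F.normalize(session_emb, dim=0)
+    pos = torch.dot(s, F.normalize(next_pos_emb, dim=0)) / tau
+    negs = (F.normalize(skipped_embs, dim=1) @ s) / tau
+    logits = torch.cat([pos.unsqueeze(0), negs])        # positive first, skips as negatives
+    return F.cross_entropy(logits.unsqueeze(0), torch.zeros(1, dtype=torch.long))
+
+loss = skip_aware_contrastive(torch.randn(64), torch.randn(64), torch.randn(5, 64))
+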
+
+ comment: To appear at the 18th ACM Conference on Recommender Systems
+
+
+
+
+
+ + ☆ Diff-VPS: Video Polyp Segmentation via a Multi-task Diffusion Network + with Adversarial Temporal Reasoning + + +
+ Diffusion Probabilistic Models have recently attracted significant attention
+in the computer vision community due to their outstanding performance.
+However, while a substantial amount of diffusion-based research has focused on
+generative tasks, no work introduces diffusion models to advance the results of
+polyp segmentation in videos, which is frequently challenged by polyps' high
+camouflage and redundant temporal cues. In this paper, we present a novel
+diffusion-based network for the video polyp segmentation task, dubbed Diff-VPS.
+We incorporate multi-task supervision into diffusion models to promote the
+discrimination of diffusion models on pixel-by-pixel segmentation. This
+integrates the contextual high-level information achieved by the joint
+classification and detection tasks. To explore temporal dependency, a Temporal
+Reasoning Module (TRM) is devised that reasons about and reconstructs the
+target frame from previous frames. We further equip TRM with a generative
+adversarial self-supervised strategy to produce more realistic frames and thus
+capture better dynamic cues. Extensive experiments are conducted on SUN-SEG,
+and the results indicate that our proposed Diff-VPS achieves state-of-the-art
+performance. Code is available at https://github.com/lydia-yllu/Diff-VPS.
+
+
+
+
+
+ + ☆ Negative Sampling in Recommendation: A Survey and Future Directions + + +
+ Recommender systems aim to capture users' personalized preferences from the
+vast amount of user behaviors, making them pivotal in the era of information
+explosion. However, the presence of dynamic preferences, "information
+cocoons", and inherent feedback loops in recommendation means users interact
+with only a limited number of items. Conventional recommendation algorithms
+typically focus on the positive historical behaviors, while neglecting the
+essential role of negative feedback in user interest understanding. As a
+promising but easily overlooked area, negative sampling is proficient at
+revealing the genuine negative aspects inherent in user behaviors, and has
+become an essential procedure in recommendation. In this survey, we first
+discuss the role of negative sampling in recommendation and thoroughly analyze
+challenges that consistently impede its progress. Then, we conduct an extensive
+literature review on the existing negative sampling strategies in
+recommendation and classify them into five categories according to their
+distinct techniques. Finally, we detail the insights of the tailored negative
+sampling strategies in diverse recommendation scenarios and outline prospective
+research directions from which the community may benefit.
+
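+ For orientation, the sketch below contrasts two of the simplest strategy
+families the survey covers, uniform random sampling and popularity-biased
+sampling; both are textbook baselines rather than methods proposed by the
+survey itself.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def uniform_negative(n_items, positives, k):
+    negs = []
+    while len(negs) < k:
+        cand = int(rng.integers(0, n_items))
+        if cand not in positives:               # only items the user has not interacted with
+            negs.append(cand)
+    return negs
+
+def popularity_negative(item_counts, positives, k, alpha=0.75):
+    probs = item_counts ** alpha                # popularity-smoothed distribution
+    probs = probs / probs.sum()
+    cands = rng.choice(len(item_counts), size=5 * k, p=probs)
+    return [int(c) for c in cands if c not in positives][:k]
+
+negs = popularity_negative(np.array([100.0, 10.0, 1.0, 50.0]), positives={0}, k=2)
+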
+
+ comment: 38 pages, 9 figures; Under review +
+
+
+
+
+ + ☆ E-commerce Webpage Recommendation Scheme Base on Semantic Mining and + Neural Networks + + +
+ On e-commerce websites, web-mining-based page recommendation technology is
+widely used. However, existing recommendation solutions often cannot meet the
+practical needs of online shoppers. To address this problem, this paper
+proposes an e-commerce web page recommendation solution that combines semantic
+web mining and BP neural networks. First, the web logs of user searches are
+processed, and five features are extracted: content priority, time consumption
+priority, online shopping users' explicit/implicit feedback on the website,
+recommendation semantics and input deviation amount. Then, these features are
+used as inputs to the BP neural network to classify and identify the priority
+of the final output web page. Finally, the web pages are sorted according to
+priority and recommended to users. This project uses book sales webpages as
+samples for experiments. The results show that this solution can quickly and
+accurately identify the webpages required by users.
+
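+ The classification step amounts to a small back-propagation (BP) network over
+the five extracted features; a minimal PyTorch sketch, with the number of
+priority classes assumed to be three.
+
+import torch
+
+bp_net = torch.nn.Sequential(
+    torch.nn.Linear(5, 16),                     # 5 extracted features per web page
+    torch.nn.Sigmoid(),                         # classic BP networks use sigmoid hidden units
+    torch.nn.Linear(16, 3),                     # assumed: high / medium / low priority
+)
+optimizer = torch.optim.SGD(bp_net.parameters(), lr=0.1)
+loss_fn = torch.nn.CrossEntropyLoss()
+
+features = torch.rand(32, 5)                    # a batch of 32 pages
+labels = torch.randint(0, 3, (32,))
+optimizer.zero_grad()
+loss = loss_fn(bp_net(features), labels)
+loss.backward()                                 # the back-propagation step
+optimizer.step()
+priority = bp_net(features).argmax(dim=1)       # pages are then sorted by predicted priority
+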
+
+ comment: arXiv admin note: text overlap with arXiv:2409.01137 +
+
+
+
+
+ + ♻ ☆ Smart E-commerce Recommendations with Semantic AI + + +
+ In e-commerce, web mining for page recommendations is widely used but often
+fails to meet user needs. To address this, we propose a novel solution
+combining semantic web mining with BP neural networks. We process user search
+logs to extract five key features: content priority, time spent, user feedback,
+recommendation semantics, and input deviation. These features are then fed into
+a BP neural network to classify and prioritize web pages. The prioritized pages
+are recommended to users. Using book sales pages for testing, our results
+demonstrate that this solution can quickly and accurately identify the pages
+users need. Our approach ensures that recommendations are more relevant and
+tailored to individual preferences, enhancing the online shopping experience.
+By leveraging advanced semantic analysis and neural network techniques, we
+bridge the gap between user expectations and actual recommendations. This
+innovative method not only improves accuracy but also speeds up the
+recommendation process, making it a valuable tool for e-commerce platforms
+aiming to boost user satisfaction and engagement. Additionally, our system's
+ability to handle large datasets and provide real-time recommendations makes it
+a scalable and efficient solution for modern e-commerce challenges.
+
+
+ comment: My paper contains some errors
+
+
+
+
+
+ + ♻ ☆ Explainable Identification of Hate Speech towards Islam using Graph + Neural Networks NeurIPS 2023 + + +
+ Islamophobic language on online platforms fosters intolerance, making +detection and elimination crucial for promoting harmony. Traditional hate +speech detection models rely on NLP techniques like tokenization, +part-of-speech tagging, and encoder-decoder models. However, Graph Neural +Networks (GNNs), with their ability to utilize relationships between data +points, offer more effective detection and greater explainability. In this +work, we represent speeches as nodes and connect them with edges based on their +context and similarity to develop the graph. This study introduces a novel +paradigm using GNNs to identify and explain hate speech towards Islam. Our +model leverages GNNs to understand the context and patterns of hate speech by +connecting texts via pretrained NLP-generated word embeddings, achieving +state-of-the-art performance and enhancing detection accuracy while providing +valuable explanations. This highlights the potential of GNNs in combating +online hate speech and fostering a safer, more inclusive online environment. + +
+
+ comment: Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (non-archival) + (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP + 2024 : NLP for Positive Impact Workshop (archival) +
+
+
+
+
+ + ♻ ☆ An Empirical Study of Training ID-Agnostic Multi-modal Sequential + Recommenders + + +
+ Sequential Recommendation (SR) aims to predict future user-item interactions
+based on historical interactions. While many SR approaches concentrate on user
+IDs and item IDs, the human perception of the world through multi-modal
+signals, like text and images, has inspired researchers to delve into
+constructing SR from multi-modal information without using IDs. However, the
+complexity of multi-modal learning manifests in diverse feature extractors,
+fusion methods, and pre-trained models. Consequently, designing a simple and
+universal Multi-Modal Sequential Recommendation (MMSR) framework remains a
+formidable challenge. We systematically summarize the existing multi-modal
+related SR methods and distill the essence into four core components: visual
+encoder, text encoder, multimodal fusion module, and sequential architecture.
+Along these dimensions, we dissect the model designs, and answer the following
+sub-questions: First, we explore how to construct MMSR from scratch, ensuring
+its performance is either on par with or exceeds that of existing SR methods
+without complex techniques. Second, we examine if MMSR can benefit from
+existing multi-modal pre-training paradigms. Third, we assess MMSR's capability
+in tackling common challenges like cold start and domain transferring. Our
+experimental results across four real-world recommendation scenarios
+demonstrate the great potential of ID-agnostic multi-modal sequential
+recommendation. Our framework can be found at: https://github.com/MMSR23/MMSR.
+
+
+ comment: We are requesting to withdraw the paper due to a significant + methodological error discovered in the experimental setup, specifically in + Section 4.3. This error affects the validity of the results and conclusions + drawn from the study. We intend to address these issues and submit a + corrected version in the future +
+
+
+
+
+ + ♻ ☆ Rs4rs: Semantically Find Recent Publications from Top Recommendation + System-Related Venues RecSys 2024 + + +
+ Rs4rs is a web application designed to perform semantic search on recent +papers from top conferences and journals related to Recommender Systems. +Current scholarly search engine tools like Google Scholar, Semantic Scholar, +and ResearchGate often yield broad results that fail to target the most +relevant high-quality publications. Moreover, manually visiting individual +conference and journal websites is a time-consuming process that primarily +supports only syntactic searches. Rs4rs addresses these issues by providing a +user-friendly platform where researchers can input their topic of interest and +receive a list of recent, relevant papers from top Recommender Systems venues. +Utilizing semantic search techniques, Rs4rs ensures that the search results are +not only precise and relevant but also comprehensive, capturing papers +regardless of variations in wording. This tool significantly enhances research +efficiency and accuracy, thereby benefitting the research community and public +by facilitating access to high-quality, pertinent academic resources in the +field of Recommender Systems. Rs4rs is available at https://rs4rs.com. + +
+
+ comment: Accepted in ACM RecSys 2024 +
+
+
+
+
+ + ♻ ☆ A Normative Framework for Benchmarking Consumer Fairness in Large + Language Model Recommender System + + +
+ The rapid adoption of large language models (LLMs) in recommender systems +(RS) presents new challenges in understanding and evaluating their biases, +which can result in unfairness or the amplification of stereotypes. Traditional +fairness evaluations in RS primarily focus on collaborative filtering (CF) +settings, which may not fully capture the complexities of LLMs, as these models +often inherit biases from large, unregulated data. This paper proposes a +normative framework to benchmark consumer fairness in LLM-powered recommender +systems (RecLLMs). + We critically examine how fairness norms in classical RS fall short in +addressing the challenges posed by LLMs. We argue that this gap can lead to +arbitrary conclusions about fairness, and we propose a more structured, formal +approach to evaluate fairness in such systems. Our experiments on the MovieLens +dataset on consumer fairness, using in-context learning (zero-shot vs. +few-shot) reveal fairness deviations in age-based recommendations, particularly +when additional contextual examples are introduced (ICL-2). Statistical +significance tests confirm that these deviations are not random, highlighting +the need for robust evaluation methods. While this work offers a preliminary +discussion on a proposed normative framework, our hope is that it could provide +a formal, principled approach for auditing and mitigating bias in RecLLMs. The +code and dataset used for this work will be shared at "gihub-anonymized". + +
+
+
+
+
+ + ♻ ☆ CF-KAN: Kolmogorov-Arnold Network-based Collaborative Filtering to + Mitigate Catastrophic Forgetting in Recommender Systems + + +
+ Collaborative filtering (CF) remains essential in recommender systems,
+leveraging user-item interactions to provide personalized recommendations.
+Meanwhile, a number of CF techniques have evolved into sophisticated model
+architectures based on multi-layer perceptrons (MLPs). However, MLPs often
+suffer from catastrophic forgetting, and thus lose previously acquired
+knowledge when new information is learned, particularly in dynamic environments
+requiring continual learning. To tackle this problem, we propose CF-KAN, a new
+CF method utilizing Kolmogorov-Arnold networks (KANs). By learning nonlinear
+functions on the edge level, KANs are more robust to the catastrophic
+forgetting problem than MLPs. Built upon a KAN-based autoencoder, CF-KAN is
+designed to effectively capture the intricacies of sparse user-item
+interactions and retain information from previous data instances. Despite its
+simplicity, our extensive experiments demonstrate 1) CF-KAN's superiority over
+state-of-the-art methods in recommendation accuracy, 2) CF-KAN's resilience to
+catastrophic forgetting, underscoring its effectiveness in both static and
+dynamic recommendation scenarios, and 3) CF-KAN's edge-level interpretation
+facilitating the explainability of recommendations.
+
+
+ comment: 9 pages, 7 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ A Survey of Multimodal Composite Editing and Retrieval + + +
+ In the real world, where information is abundant and diverse across different
+modalities, understanding and utilizing various data types to improve retrieval
+systems is a key focus of research. Multimodal composite retrieval integrates
+diverse modalities such as text, images, and audio to provide more accurate,
+personalized, and contextually relevant results. To facilitate a deeper
+understanding of this promising direction, this survey explores multimodal
+composite editing and retrieval in depth, covering image-text composite
+editing, image-text composite retrieval, and other multimodal composite
+retrieval. In this survey, we systematically organize the application
+scenarios, methods, benchmarks, experiments, and future directions. Multimodal
+learning is a hot topic in the large-model era, and several surveys on
+multimodal learning and vision-language models with Transformers have been
+published in the PAMI journal. To the best of our knowledge, this survey is the
+first comprehensive review of the literature on multimodal composite retrieval,
+offering a timely complement on multimodal fusion to existing reviews. To help
+readers quickly track this field, we have built a project page for this survey,
+which can be found at
+https://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.
+
+
+ comment: 20 pages, 3 figures, and 11 tables +
+
+
+
+
+ + ♻ ☆ Dataset Regeneration for Sequential Recommendation + + +
+ The sequential recommender (SR) system is a crucial component of modern +recommender systems, as it aims to capture the evolving preferences of users. +Significant efforts have been made to enhance the capabilities of SR systems. +These methods typically follow the model-centric paradigm, which involves +developing effective models based on fixed datasets. However, this approach +often overlooks potential quality issues and flaws inherent in the data. Driven +by the potential of data-centric AI, we propose a novel data-centric paradigm +for developing an ideal training dataset using a model-agnostic dataset +regeneration framework called DR4SR. This framework enables the regeneration of +a dataset with exceptional cross-architecture generalizability. Additionally, +we introduce the DR4SR+ framework, which incorporates a model-aware dataset +personalizer to tailor the regenerated dataset specifically for a target model. +To demonstrate the effectiveness of the data-centric paradigm, we integrate our +framework with various model-centric methods and observe significant +performance improvements across four widely adopted datasets. Furthermore, we +conduct in-depth analyses to explore the potential of the data-centric paradigm +and provide valuable insights. The code can be found at +https://github.com/USTC-StarTeam/DR4SR. + +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ DreamMesh: Jointly Manipulating and Texturing Triangle Meshes for + Text-to-3D Generation ECCV 2024 + + +
+ Learning radiance fields (NeRF) with powerful 2D diffusion models has
+garnered popularity for text-to-3D generation. Nevertheless, the implicit 3D
+representations of NeRF lack explicit modeling of meshes and textures over
+surfaces, and such a surface-undefined approach may suffer from issues such as
+noisy surfaces with ambiguous texture details or cross-view inconsistency. To
+alleviate this, we present DreamMesh, a novel text-to-3D architecture that
+pivots on well-defined surfaces (triangle meshes) to generate high-fidelity
+explicit 3D models. Technically, DreamMesh capitalizes on a distinctive
+coarse-to-fine scheme. In the coarse stage, the mesh is first deformed by
+text-guided Jacobians and then DreamMesh textures the mesh with an interlaced
+use of 2D diffusion models in a tuning-free manner from multiple viewpoints. In
+the fine stage, DreamMesh jointly manipulates the mesh and refines the texture
+map, leading to high-quality triangle meshes with high-fidelity textured
+materials. Extensive experiments demonstrate that DreamMesh significantly
+outperforms state-of-the-art text-to-3D methods in faithfully generating 3D
+content with richer textual details and enhanced geometry. Our project page is
+available at https://dreammesh.github.io.
+
+
+ comment: ECCV 2024. Project page is available at
+ https://dreammesh.github.io
+
+
+
+
+
+ + ☆ Hi3D: Pursuing High-Resolution Image-to-3D Generation with Video + Diffusion Models + + +
+ Despite tremendous progress in image-to-3D generation, existing methods still
+struggle to produce multi-view consistent images with high-resolution textures
+in detail, especially in the paradigm of 2D diffusion that lacks 3D awareness.
+In this work, we present High-resolution Image-to-3D model (Hi3D), a new
+video-diffusion-based paradigm that recasts the generation of multi-view images
+from a single image as 3D-aware sequential image generation (i.e., orbital
+video generation). This methodology delves into the underlying temporal
+consistency knowledge in video diffusion models that generalizes well to
+geometry consistency across multiple views in 3D generation. Technically, Hi3D
+first empowers the pre-trained video diffusion model with 3D-aware prior
+(camera pose condition), yielding multi-view images with low-resolution texture
+details. A 3D-aware video-to-video refiner is learnt to further scale up the
+multi-view images with high-resolution texture details. Such high-resolution
+multi-view images are further augmented with novel views through 3D Gaussian
+Splatting, which are finally leveraged to obtain high-fidelity meshes via 3D
+reconstruction. Extensive experiments on both novel view synthesis and single
+view reconstruction demonstrate that our Hi3D manages to produce superior
+multi-view consistent images with highly detailed textures. Source code and
+data are available at https://github.com/yanghb22-fdu/Hi3D-Official.
+
+
+ comment: ACM Multimedia 2024. Source code is available at
+ https://github.com/yanghb22-fdu/Hi3D-Official
+
+
+
+
+
+ + ☆ FreeEnhance: Tuning-Free Image Enhancement via Content-Consistent + Noising-and-Denoising Process + + +
+ The emergence of text-to-image generation models has led to the recognition
+that image enhancement, performed as post-processing, would significantly
+improve the visual quality of the generated images. Exploring diffusion models
+to enhance the generated images nevertheless is not trivial and necessitates
+delicately enriching plentiful details while preserving the visual appearance
+of key content in the original image. In this paper, we propose a novel
+framework, namely FreeEnhance, for content-consistent image enhancement using
+off-the-shelf image diffusion models. Technically, FreeEnhance is a two-stage
+process that first adds random noise to the input image and then capitalizes
+on a pre-trained image diffusion model (i.e., Latent Diffusion Models) to
+denoise and enhance the image details. In the noising stage, FreeEnhance is
+devised to add lighter noise to regions with higher frequency to preserve the
+high-frequency patterns (e.g., edges, corners) in the original image. In the
+denoising stage, we present three target properties as constraints to
+regularize the predicted noise, enhancing images with high acutance and high
+visual quality. Extensive experiments conducted on the HPDv2 dataset
+demonstrate that our FreeEnhance outperforms the state-of-the-art image
+enhancement models in terms of quantitative metrics and human preference. More
+remarkably, FreeEnhance also shows higher human preference compared to the
+commercial image enhancement solution of Magnific AI.
+
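+ The noising-stage idea, lighter noise where local frequency is high so edges
+and corners survive, can be sketched with a Laplacian-based frequency map.
+This uses NumPy/SciPy purely for illustration and is not the paper's code.
+
+import numpy as np
+from scipy.ndimage import laplace, gaussian_filter
+
+def content_aware_noise(img, base_sigma=0.2):
+    """img: float array in [0, 1], shape (H, W)."""
+    freq = np.abs(laplace(img))                 # large near edges and corners
+    freq = gaussian_filter(freq, sigma=2.0)
+    freq = freq / (freq.max() + 1e-8)
+    noise_scale = base_sigma * (1.0 - 0.8 * freq)   # lighter noise in high-frequency regions
+    return np.clip(img + np.random.randn(*img.shape) * noise_scale, 0.0, 1.0)
+
+noisy = content_aware_noise(np.random.rand(64, 64))
+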
+
+ comment: ACM Multimedia 2024 +
+
+
+
+
+ + ☆ VMAS: Video-to-Music Generation via Semantic Alignment in Web Music + Videos + + +
+ We present a framework for learning to generate background music from video +inputs. Unlike existing works that rely on symbolic musical annotations, which +are limited in quantity and diversity, our method leverages large-scale web +videos accompanied by background music. This enables our model to learn to +generate realistic and diverse music. To accomplish this goal, we develop a +generative video-music Transformer with a novel semantic video-music alignment +scheme. Our model uses a joint autoregressive and contrastive learning +objective, which encourages the generation of music aligned with high-level +video content. We also introduce a novel video-beat alignment scheme to match +the generated music beats with the low-level motions in the video. Lastly, to +capture fine-grained visual cues in a video needed for realistic background +music generation, we introduce a new temporal video encoder architecture, +allowing us to efficiently process videos consisting of many densely sampled +frames. We train our framework on our newly curated DISCO-MV dataset, +consisting of 2.2M video-music samples, which is orders of magnitude larger +than any prior datasets used for video music generation. Our method outperforms +existing approaches on the DISCO-MV and MusicCaps datasets according to various +music generation evaluation metrics, including human evaluation. Results are +available at https://genjib.github.io/project_page/VMAs/index.html + +
+
+ comment: Project Page: https://genjib.github.io/project_page/VMAs/index.html +
+
+
+
+
+ + ♻ ☆ CustomContrast: A Multilevel Contrastive Perspective For Subject-Driven + Text-to-Image Customization + + +
+ Subject-driven text-to-image (T2I) customization has drawn significant
+interest in academia and industry. This task enables pre-trained models to
+generate novel images based on unique subjects. Existing studies adopt a
+self-reconstructive perspective, focusing on capturing all details of a single
+image, which will misconstrue the specific image's irrelevant attributes (e.g.,
+view, pose, and background) as the subject's intrinsic attributes. This
+misconstruction leads to both overfitting and underfitting of irrelevant and
+intrinsic attributes of the subject, i.e., these attributes are
+over-represented or under-represented simultaneously, causing a trade-off
+between similarity and controllability. In this study, we argue that an ideal
+subject representation can be achieved from a cross-differential perspective,
+i.e., decoupling subject intrinsic attributes from irrelevant attributes via
+contrastive learning, which allows the model to focus more on intrinsic
+attributes through intra-consistency (features of the same subject are
+spatially closer) and inter-distinctiveness (features of different subjects
+show distinct differences). Specifically, we propose CustomContrast, a
+novel framework, which includes a Multilevel Contrastive Learning (MCL)
+paradigm and a Multimodal Feature Injection (MFI) Encoder. The MCL paradigm is
+used to extract intrinsic features of subjects from high-level semantics to
+low-level appearance through crossmodal semantic contrastive learning and
+multiscale appearance contrastive learning. To facilitate contrastive learning,
+we introduce the MFI encoder to capture cross-modal representations. Extensive
+experiments show the effectiveness of CustomContrast in subject similarity and
+text controllability.
+
+
+
+
+
+ + ♻ ☆ Attack on Scene Flow using Point Clouds + + +
+ Deep neural networks have made significant advancements in accurately +estimating scene flow using point clouds, which is vital for many applications +like video analysis, action recognition, and navigation. The robustness of +these techniques, however, remains a concern, particularly in the face of +adversarial attacks that have been proven to deceive state-of-the-art deep +neural networks in many domains. Surprisingly, the robustness of scene flow +networks against such attacks has not been thoroughly investigated. To address +this problem, the proposed approach aims to bridge this gap by introducing +adversarial white-box attacks specifically tailored for scene flow networks. +Experimental results show that the generated adversarial examples obtain up to +33.7 relative degradation in average end-point error on the KITTI and +FlyingThings3D datasets. The study also reveals the significant impact that +attacks targeting point clouds in only one dimension or color channel have on +average end-point error. Analyzing the success and failure of these attacks on +the scene flow networks and their 2D optical flow network variants shows a +higher vulnerability for the optical flow networks. Code is available at +https://github.com/aheldis/Attack-on-Scene-Flow-using-Point-Clouds.git. + +
+
+
+
+
+ + ♻ ☆ Looking Backward: Streaming Video-to-Video Translation with Feature + Banks + + +
+ This paper introduces StreamV2V, a diffusion model that achieves real-time +streaming video-to-video (V2V) translation with user prompts. Unlike prior V2V +methods using batches to process limited frames, we opt to process frames in a +streaming fashion, to support unlimited frames. At the heart of StreamV2V lies +a backward-looking principle that relates the present to the past. This is +realized by maintaining a feature bank, which archives information from past +frames. For incoming frames, StreamV2V extends self-attention to include banked +keys and values and directly fuses similar past features into the output. The +feature bank is continually updated by merging stored and new features, making +it compact but informative. StreamV2V stands out for its adaptability and +efficiency, seamlessly integrating with image diffusion models without +fine-tuning. It can run 20 FPS on one A100 GPU, being 15x, 46x, 108x, and 158x +faster than FlowVid, CoDeF, Rerender, and TokenFlow, respectively. Quantitative +metrics and user studies confirm StreamV2V's exceptional ability to maintain +temporal consistency. + +
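+ At a shape level, the backward-looking mechanism extends self-attention so
+that keys and values from the feature bank are concatenated with the current
+frame's; the sketch below illustrates that idea and the compact bank update,
+not StreamV2V's actual implementation.
+
+import torch
+import torch.nn.functional as F
+
+def banked_attention(q, k, v, bank_k, bank_v):
+    """q, k, v: (n_tokens, d) for the current frame; bank_k, bank_v: (n_banked, d)."""
+    k_all = torch.cat([k, bank_k], dim=0)       # attend over current and banked keys
+    v_all = torch.cat([v, bank_v], dim=0)
+    attn = F.softmax(q @ k_all.T / k_all.shape[-1] ** 0.5, dim=-1)
+    return attn @ v_all                         # fuses similar past features into the output
+
+def update_bank(bank, new_feats, max_size=1024):
+    bank = torch.cat([bank, new_feats], dim=0)  # merge stored and new features
+    return bank[-max_size:]                     # keep the bank compact
+
+out = banked_attention(torch.randn(256, 64), torch.randn(256, 64), torch.randn(256, 64),
+                       torch.randn(512, 64), torch.randn(512, 64))
+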
+
+ comment: Project page: https://jeff-liangf.github.io/projects/streamv2v +
+
+
+
+
+ + ♻ ☆ A Survey of Multimodal Composite Editing and Retrieval + + +
+ In the real world, where information is abundant and diverse across different
+modalities, understanding and utilizing various data types to improve retrieval
+systems is a key focus of research. Multimodal composite retrieval integrates
+diverse modalities such as text, images, and audio to provide more accurate,
+personalized, and contextually relevant results. To facilitate a deeper
+understanding of this promising direction, this survey explores multimodal
+composite editing and retrieval in depth, covering image-text composite
+editing, image-text composite retrieval, and other multimodal composite
+retrieval. In this survey, we systematically organize the application
+scenarios, methods, benchmarks, experiments, and future directions. Multimodal
+learning is a hot topic in the large-model era, and several surveys on
+multimodal learning and vision-language models with Transformers have been
+published in the PAMI journal. To the best of our knowledge, this survey is the
+first comprehensive review of the literature on multimodal composite retrieval,
+offering a timely complement on multimodal fusion to existing reviews. To help
+readers quickly track this field, we have built a project page for this survey,
+which can be found at
+https://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.
+
+
+ comment: 20 pages, 3 figures, and 11 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Interactive Counterfactual Exploration of Algorithmic Harms in + Recommender Systems + + +
+ Recommender systems have become integral to digital experiences, shaping user +interactions and preferences across various platforms. Despite their widespread +use, these systems often suffer from algorithmic biases that can lead to unfair +and unsatisfactory user experiences. This study introduces an interactive tool +designed to help users comprehend and explore the impacts of algorithmic harms +in recommender systems. By leveraging visualizations, counterfactual +explanations, and interactive modules, the tool allows users to investigate how +biases such as miscalibration, stereotypes, and filter bubbles affect their +recommendations. Informed by in-depth user interviews, this tool benefits both +general users and researchers by increasing transparency and offering +personalized impact assessments, ultimately fostering a better understanding of +algorithmic biases and contributing to more equitable recommendation outcomes. +This work provides valuable insights for future research and practical +applications in mitigating bias and enhancing fairness in machine learning +algorithms. + +
+
+
+
+
+ + ☆ Adversarial Attacks to Multi-Modal Models + + +
+ Multi-modal models have gained significant attention due to their powerful
+capabilities. These models effectively align embeddings across diverse data
+modalities, showcasing superior performance in downstream tasks compared to
+their unimodal counterparts. A recent study showed that an attacker can
+manipulate an image or audio file by altering it in such a way that its
+embedding matches that of an attacker-chosen targeted input, thereby deceiving
+downstream models. However, this method often underperforms due to inherent
+disparities in data from different modalities. In this paper, we introduce
+CrossFire, an innovative approach to attack multi-modal models. CrossFire
+begins by transforming the targeted input chosen by the attacker into a format
+that matches the modality of the original image or audio file. We then
+formulate our attack as an optimization problem, aiming to minimize the angular
+deviation between the embeddings of the transformed input and the modified
+image or audio file. Solving this problem determines the perturbations to be
+added to the original media. Our extensive experiments on six real-world
+benchmark datasets reveal that CrossFire can significantly manipulate
+downstream tasks, surpassing existing attacks. Additionally, we evaluate six
+defensive strategies against CrossFire, finding that current defenses are
+insufficient to counteract CrossFire.
+
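+ The optimisation described above, shrinking the angle between the perturbed
+media's embedding and a modality-matched target embedding, can be sketched as
+a cosine-distance minimisation with a bounded perturbation; the encoder and
+bound are placeholders, not CrossFire's actual settings.
+
+import torch
+import torch.nn.functional as F
+
+def angular_attack(encoder, image, target_emb, steps=100, eps=8 / 255, lr=1e-2):
+    """encoder: any embedding model; image: (C, H, W) in [0, 1]; target_emb: (d,)."""
+    delta = torch.zeros_like(image, requires_grad=True)
+    opt = torch.optim.Adam([delta], lr=lr)
+    for _ in range(steps):
+        emb = encoder((image + delta).clamp(0, 1).unsqueeze(0)).squeeze(0)
+        loss = 1 - F.cosine_similarity(emb, target_emb, dim=0)   # angular deviation proxy
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+        with torch.no_grad():
+            delta.clamp_(-eps, eps)             # keep the perturbation imperceptible
+    return (image + delta).detach().clamp(0, 1)
+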
+
+ comment: To appear in the ACM Workshop on Large AI Systems and Models with + Privacy and Safety Analysis 2024 (LAMPS '24) +
+
+
+
+
+ + ☆ Critical Features Tracking on Triangulated Irregular Networks by a + Scale-Space Method SP + + +
+ The scale-space method is a well-established framework that constructs a +hierarchical representation of an input signal and facilitates coarse-to-fine +visual reasoning. Considering the terrain elevation function as the input +signal, the scale-space method can identify and track significant topographic +features across different scales. The number of scales a feature persists, +called its life span, indicates the importance of that feature. In this way, +important topographic features of a landscape can be selected, which are useful +for many applications, including cartography, nautical charting, and land-use +planning. The scale-space methods developed for terrain data use gridded +Digital Elevation Models (DEMs) to represent the terrain. However, gridded DEMs +lack the flexibility to adapt to the irregular distribution of input data and +the varied topological complexity of different regions. Instead, Triangulated +Irregular Networks (TINs) can be directly generated from irregularly +distributed point clouds and accurately preserve important features. In this +work, we introduce a novel scale-space analysis pipeline for TINs, addressing +the multiple challenges in extending grid-based scale-space methods to TINs. +Our pipeline can efficiently identify and track topologically important +features on TINs. Moreover, it is capable of analyzing terrains with irregular +boundaries, which poses challenges for grid-based methods. Comprehensive +experiments show that, compared to grid-based methods, our TIN-based pipeline +is more efficient, accurate, and has better resolution robustness. + +
+
+ comment: 13 pages, ACM SIGSPATIAL 2024
+
+
+
+
+
+ + ☆ DV-FSR: A Dual-View Target Attack Framework for Federated Sequential + Recommendation + + +
+ Federated recommendation (FedRec) preserves user privacy by enabling
+decentralized training of personalized models, but this architecture is
+inherently vulnerable to adversarial attacks. Significant research has been
+conducted on targeted attacks in FedRec systems, motivated by commercial and
+social influence considerations. However, much of this work has largely
+overlooked the differential robustness of recommendation models. Moreover, our
+empirical findings indicate that existing targeted attack methods achieve only
+limited effectiveness in Federated Sequential Recommendation (FSR) tasks.
+Driven by these observations, we focus on investigating targeted attacks in FSR
+and propose a novel dual-view attack framework, named DV-FSR. This attack
+method uniquely combines a sampling-based explicit strategy with a contrastive
+learning-based implicit gradient strategy to orchestrate a coordinated attack.
+Additionally, we introduce a specific defense mechanism tailored for targeted
+attacks in FSR, aiming to evaluate the extent to which the proposed attack can
+be mitigated. Extensive experiments validate the effectiveness of our proposed
+approach on representative sequential models.
+
+
+
+
+
+ + ☆ Operational Advice for Dense and Sparse Retrievers: HNSW, Flat, or + Inverted Indexes? + + +
+ Practitioners working on dense retrieval today face a bewildering number of +choices. Beyond selecting the embedding model, another consequential choice is +the actual implementation of nearest-neighbor vector search. While best +practices recommend HNSW indexes, flat vector indexes with brute-force search +represent another viable option, particularly for smaller corpora and for rapid +prototyping. In this paper, we provide experimental results on the BEIR dataset +using the open-source Lucene search library that explicate the tradeoffs +between HNSW and flat indexes (including quantized variants) from the +perspectives of indexing time, query evaluation performance, and retrieval +quality. With additional comparisons between dense and sparse retrievers, our +results provide guidance for today's search practitioner in understanding the +design space of dense and sparse retrievers. To our knowledge, we are the first +to provide operational advice supported by empirical experiments in this +regard. + +
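+ The flat-versus-HNSW trade-off is easy to try on one's own vectors; the
+sketch below uses faiss purely for illustration rather than the Lucene setup
+evaluated in the paper, and the corpus is random stand-in data.
+
+import numpy as np
+import faiss
+
+d = 384
+xb = np.random.rand(100_000, d).astype("float32")   # corpus embeddings (stand-in)
+xq = np.random.rand(10, d).astype("float32")        # query embeddings
+faiss.normalize_L2(xb); faiss.normalize_L2(xq)      # cosine ranking via normalised L2
+
+flat = faiss.IndexFlatL2(d)                         # brute force: exact, no build cost
+flat.add(xb)
+hnsw = faiss.IndexHNSWFlat(d, 32)                   # HNSW: approximate, graph build cost
+hnsw.add(xb)
+
+_, exact_ids = flat.search(xq, 10)
+_, approx_ids = hnsw.search(xq, 10)
+recall = np.mean([len(set(a) & set(e)) / 10 for a, e in zip(approx_ids, exact_ids)])
+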
+
+
+
+
+ + ☆ Enhancing Sequential Recommendations through Multi-Perspective + Reflections and Iteration + + +
+ Sequence recommendation (SeqRec) aims to predict the next item a user will +interact with by understanding user intentions and leveraging collaborative +filtering information. Large language models (LLMs) have shown great promise in +recommendation tasks through prompt-based, fixed reflection libraries, and +fine-tuning techniques. However, these methods face challenges, including lack +of supervision, inability to optimize reflection sources, inflexibility to +diverse user needs, and high computational costs. Despite promising results, +current studies primarily focus on reflections of users' explicit preferences +(e.g., item titles) while neglecting implicit preferences (e.g., brands) and +collaborative filtering information. This oversight hinders the capture of +preference shifts and dynamic user behaviors. Additionally, existing approaches +lack mechanisms for reflection evaluation and iteration, often leading to +suboptimal recommendations. To address these issues, we propose the Mixture of +REflectors (MoRE) framework, designed to model and learn dynamic user +preferences in SeqRec. Specifically, MoRE introduces three reflectors for +generating LLM-based reflections on explicit preferences, implicit preferences, +and collaborative signals. Each reflector incorporates a self-improving +strategy, termed refining-and-iteration, to evaluate and iteratively update +reflections. Furthermore, a meta-reflector employs a contextual bandit +algorithm to select the most suitable expert and corresponding reflections for +each user's recommendation, effectively capturing dynamic preferences. +Extensive experiments on three real-world datasets demonstrate that MoRE +consistently outperforms state-of-the-art methods, requiring less training time +and GPU memory compared to other LLM-based approaches in SeqRec. + +
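+ The meta-reflector's job, picking among the explicit-preference,
+implicit-preference, and collaborative reflectors per user, can be illustrated
+with an epsilon-greedy contextual bandit; this is a generic sketch, not the
+specific bandit algorithm used in MoRE.
+
+import random
+from collections import defaultdict
+
+class EpsilonGreedyMetaReflector:
+    def __init__(self, experts, eps=0.1):
+        self.experts = experts                  # e.g. ["explicit", "implicit", "collaborative"]
+        self.eps = eps
+        self.value = defaultdict(float)         # running reward estimate per (context, expert)
+        self.count = defaultdict(int)
+
+    def select(self, context):
+        if random.random() < self.eps:
+            return random.choice(self.experts)  # explore
+        return max(self.experts, key=lambda e: self.value[(context, e)])
+
+    def update(self, context, expert, reward):
+        key = (context, expert)
+        self.count[key] += 1
+        self.value[key] += (reward - self.value[key]) / self.count[key]
+
+meta = EpsilonGreedyMetaReflector(["explicit", "implicit", "collaborative"])
+choice = meta.select("user_42")
+meta.update("user_42", choice, reward=1.0)      # e.g. 1.0 if the recommendation was accepted
+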
+
+ comment: The first three authors contributed equally to this work
+
+
+
+
+
+ + ☆ User Preferences for Large Language Model versus Template-Based + Explanations of Movie Recommendations: A Pilot Study + + +
+ Recommender systems have become integral to our digital experiences, from +online shopping to streaming platforms. Still, the rationale behind their +suggestions often remains opaque to users. While some systems employ a +graph-based approach, offering inherent explainability through paths +associating recommended items and seed items, non-experts could not easily +understand these explanations. A popular alternative is to convert graph-based +explanations into textual ones using a template and an algorithm, which we +denote here as ''template-based'' explanations. Yet, these can sometimes come +across as impersonal or uninspiring. A novel method would be to employ large +language models (LLMs) for this purpose, which we denote as ''LLM-based''. To +assess the effectiveness of LLMs in generating more resonant explanations, we +conducted a pilot study with 25 participants. They were presented with three +explanations: (1) traditional template-based, (2) LLM-based rephrasing of the +template output, and (3) purely LLM-based explanations derived from the +graph-based explanations. Although subject to high variance, preliminary +findings suggest that LLM-based explanations may provide a richer and more +engaging user experience, further aligning with user expectations. This study +sheds light on the potential limitations of current explanation methods and +offers promising directions for leveraging large language models to improve +user satisfaction and trust in recommender systems. + +
+
+ comment: Presented to the Dutch-Belgian Workshop on Recommender Systems 2023 + (14-15 December, 2023 - Antwerp, Belgium) +
+
+
+
+
+ + ☆ NLP-Powered Repository and Search Engine for Academic Papers: A Case + Study on Cyber Risk Literature with CyLit + + +
+ As the body of academic literature continues to grow, researchers face +increasing difficulties in effectively searching for relevant resources. +Existing databases and search engines often fall short of providing a +comprehensive and contextually relevant collection of academic literature. To +address this issue, we propose a novel framework that leverages Natural +Language Processing (NLP) techniques. This framework automates the retrieval, +summarization, and clustering of academic literature within a specific research +domain. To demonstrate the effectiveness of our approach, we introduce CyLit, +an NLP-powered repository specifically designed for the cyber risk literature. +CyLit empowers researchers by providing access to context-specific resources +and enabling the tracking of trends in the dynamic and rapidly evolving field +of cyber risk. Through the automatic processing of large volumes of data, our +NLP-powered solution significantly enhances the efficiency and specificity of +academic literature searches. We compare the literature categorization results +of CyLit to those presented in survey papers or generated by ChatGPT, +highlighting the distinctive insights this tool provides into cyber risk +research literature. Using NLP techniques, we aim to revolutionize the way +researchers discover, analyze, and utilize academic resources, ultimately +fostering advancements in various domains of knowledge. + +
+
+
+
+
+ + ☆ HierLLM: Hierarchical Large Language Model for Question Recommendation + + +
+ Question recommendation is a task that sequentially recommends questions for
+students to enhance their learning efficiency. That is, given the learning
+history and learning target of a student, a question recommender is supposed to
+select the question that will bring the most improvement for the student.
+Previous methods typically model question recommendation as a sequential
+decision-making problem, estimating the student's learning state from the
+learning history, and feeding the learning state along with the learning target
+to a neural network to select the recommended question from a question set.
+However, previous methods face two challenges: (1) learning history is
+unavailable in the cold start scenario, which makes the recommender generate
+inappropriate recommendations; (2) the size of the question set is very large,
+which makes it difficult for the recommender to select the best question
+precisely. To address these challenges, we propose a method called hierarchical
+large language model for question recommendation (HierLLM), which is an
+LLM-based hierarchical structure. The LLM-based structure enables HierLLM to
+tackle the cold start issue with the strong reasoning abilities of LLMs. The
+hierarchical structure takes advantage of the fact that the number of concepts
+is significantly smaller than the number of questions, narrowing the range of
+selectable questions by first identifying the relevant concept for the question
+to recommend, and then selecting the recommended question based on that
+concept. This hierarchical structure reduces the difficulty of the
+recommendation. To investigate the performance of HierLLM, we conduct extensive
+experiments, and the results demonstrate the outstanding performance of
+HierLLM.
+
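+ The hierarchical narrowing step can be sketched as two chained LLM calls,
+first choosing a concept from the small concept set and then a question within
+that concept; `llm` is a hypothetical completion function and the prompts are
+illustrative only.
+
+def recommend_question(learning_history, learning_target, concept_to_questions, llm):
+    """concept_to_questions: dict mapping each concept to its candidate questions."""
+    concepts = list(concept_to_questions)
+    concept = llm(                              # stage 1: pick from the small concept set
+        f"History: {learning_history}\nTarget: {learning_target}\n"
+        f"Choose the most helpful concept from: {concepts}"
+    )
+    candidates = concept_to_questions.get(concept, [])
+    return llm(                                 # stage 2: pick a question within that concept
+        f"History: {learning_history}\nTarget: {learning_target}\n"
+        f"Choose the best next question from: {candidates}"
+    )
+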
+
+
+
+
+ + ☆ What makes a good concept anyway ? + + +
+ A good medical ontology is expected to cover its domain completely and +correctly. On the other hand, large ontologies are hard to build, hard to +understand, and hard to maintain. Thus, adding new concepts (often multi-word +concepts) to an existing ontology must be done judiciously. Only "good" +concepts should be added; however, it is difficult to define what makes a +concept good. In this research, we propose a metric to measure the goodness of +a concept. We identified factors that appear to influence goodness judgments of +medical experts and combined them into a single metric. These factors include +concept name length (in words), concept occurrence frequency in the medical +literature, and syntactic categories of component words. As an added factor, we +used the simplicity of a term after mapping it into a specific foreign +language. We performed Bayesian optimization of factor weights to achieve +maximum agreement between the metric and three medical experts. The results +showed that our metric had a 50.67% overall agreement with the experts, as +measured by Krippendorff's alpha. + +
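+ A simplified version of such a metric is a weighted combination of the listed
+factors, with weights that could be tuned (for example by Bayesian
+optimisation) against expert ratings; the factor scalings and default weights
+below are placeholders.
+
+import math
+
+def concept_goodness(name_len_words, corpus_freq, noun_ratio, foreign_simplicity,
+                     weights=(0.3, 0.3, 0.2, 0.2)):
+    """Each factor is mapped to [0, 1]; higher is better. Values are illustrative."""
+    f_len = 1.0 / name_len_words                        # shorter multi-word names score higher
+    f_freq = min(1.0, math.log1p(corpus_freq) / 10.0)   # frequent in the medical literature
+    f_syn = noun_ratio                                  # fraction of component words that are nouns
+    f_simple = foreign_simplicity                       # simplicity after mapping to another language
+    w1, w2, w3, w4 = weights
+    return w1 * f_len + w2 * f_freq + w3 * f_syn + w4 * f_simple
+
+score = concept_goodness(3, corpus_freq=1200, noun_ratio=0.67, foreign_simplicity=0.8)
+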
+
+
+
+
+ + ♻ ☆ QueryBuilder: Human-in-the-Loop Query Development for Information + Retrieval + + +
+ Frequently, users of an Information Retrieval (IR) system start with an
+overarching information need (a.k.a., an analytic task) and proceed to define
+finer-grained queries covering various important aspects (i.e., sub-topics) of
+that analytic task. We present a novel, interactive system called QueryBuilder,
+which allows a novice, English-speaking user to create queries with a small
+amount of effort, through efficient exploration of an English development
+corpus in order to rapidly develop cross-lingual information retrieval queries
+corresponding to the user's information needs. QueryBuilder performs near
+real-time retrieval of documents based on user-entered search terms; the user
+looks through the retrieved documents and marks sentences as relevant to the
+information needed. The marked sentences are used by the system as additional
+information in query formation and refinement: query terms (and, optionally,
+event features, which capture event "triggers" (indicator terms) and
+agent/patient roles) are appropriately weighted, and a neural-based system,
+which better captures textual meaning, retrieves other relevant content. The
+process of retrieval and marking is repeated as many times as desired, giving
+rise to increasingly refined queries in each iteration. The final product is a
+fine-grained query used in Cross-Lingual Information Retrieval (CLIR). Our
+experiments using analytic tasks and requests from the IARPA BETTER IR datasets
+show that with a small amount of effort (at most 10 minutes per sub-topic),
+novice users can form useful fine-grained queries including in languages they
+don't understand. QueryBuilder also provides beneficial capabilities to the
+traditional corpus exploration and query formation process. A demonstration
+video is released at https://vimeo.com/734795835.
+
+
+
+
+
+ + ♻ ☆ RBoard: A Unified Platform for Reproducible and Reusable Recommender + System Benchmarks + + +
+ Recommender systems research lacks standardized benchmarks for +reproducibility and algorithm comparisons. We introduce RBoard, a novel +framework addressing these challenges by providing a comprehensive platform for +benchmarking diverse recommendation tasks, including CTR prediction, Top-N +recommendation, and others. RBoard's primary objective is to enable fully +reproducible and reusable experiments across these scenarios. The framework +evaluates algorithms across multiple datasets within each task, aggregating +results for a holistic performance assessment. It implements standardized +evaluation protocols, ensuring consistency and comparability. To facilitate +reproducibility, all user-provided code can be easily downloaded and executed, +allowing researchers to reliably replicate studies and build upon previous +work. By offering a unified platform for rigorous, reproducible evaluation +across various recommendation scenarios, RBoard aims to accelerate progress in +the field and establish a new standard for recommender systems benchmarking in +both academia and industry. The platform is available at https://rboard.org and +the demo video can be found at https://bit.ly/rboard-demo. + +
+
+
+
+
+ + ♻ ☆ Graph Retrieval-Augmented Generation: A Survey + + +
+ Recently, Retrieval-Augmented Generation (RAG) has achieved remarkable +success in addressing the challenges of Large Language Models (LLMs) without +necessitating retraining. By referencing an external knowledge base, RAG +refines LLM outputs, effectively mitigating issues such as "hallucination", +lack of domain-specific knowledge, and outdated information. However, the +complex structure of relationships among different entities in databases +presents challenges for RAG systems. In response, GraphRAG leverages structural +information across entities to enable more precise and comprehensive retrieval, +capturing relational knowledge and facilitating more accurate, context-aware +responses. Given the novelty and potential of GraphRAG, a systematic review of +current technologies is imperative. This paper provides the first comprehensive +overview of GraphRAG methodologies. We formalize the GraphRAG workflow, +encompassing Graph-Based Indexing, Graph-Guided Retrieval, and Graph-Enhanced +Generation. We then outline the core technologies and training methods at each +stage. Additionally, we examine downstream tasks, application domains, +evaluation methodologies, and industrial use cases of GraphRAG. Finally, we +explore future research directions to inspire further inquiries and advance +progress in the field. In order to track recent progress in this field, we set +up a repository at https://github.com/pengboci/GraphRAG-Survey. + +
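+
+ A toy sketch of the three-stage workflow the survey formalizes (Graph-Based Indexing,
+ Graph-Guided Retrieval, Graph-Enhanced Generation), using networkx for the graph and a
+ plain string prompt in place of a real LLM call; the triples, entity names, and
+ retrieval radius are made-up assumptions:
+
+```python
+import networkx as nx
+
+# 1) Graph-Based Indexing: build an entity graph from (head, relation, tail) triples.
+triples = [("Aspirin", "treats", "Headache"),
+           ("Aspirin", "interacts_with", "Warfarin"),
+           ("Warfarin", "treats", "Thrombosis")]
+graph = nx.Graph()
+for head, relation, tail in triples:
+    graph.add_edge(head, tail, relation=relation)
+
+# 2) Graph-Guided Retrieval: collect the neighborhood of entities mentioned in the query.
+def retrieve_subgraph(query, radius=1):
+    seeds = [node for node in graph.nodes if node.lower() in query.lower()]
+    nodes = set()
+    for seed in seeds:
+        nodes |= set(nx.ego_graph(graph, seed, radius=radius).nodes)
+    return graph.subgraph(nodes)
+
+# 3) Graph-Enhanced Generation: serialize the retrieved subgraph into the LLM prompt.
+def build_prompt(query):
+    subgraph = retrieve_subgraph(query)
+    facts = [f"{u} --{d['relation']}--> {v}" for u, v, d in subgraph.edges(data=True)]
+    return "Facts:\n" + "\n".join(facts) + f"\n\nQuestion: {query}\nAnswer:"
+
+print(build_prompt("Does Aspirin interact with Warfarin?"))  # pass to any LLM
+```
+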
+
+ comment: Ongoing work. Compared to the first version, several references have + been added and a GitHub repository link has been provided +
+
+
+
+
+ + ♻ ☆ Multi-Margin Cosine Loss: Proposal and Application in Recommender + Systems + + +
+ Recommender systems guide users through vast amounts of information by +suggesting items based on their predicted preferences. Collaborative +filtering-based deep learning techniques have regained popularity due to their +straightforward nature, relying only on user-item interactions. Typically, +these systems consist of three main components: an interaction module, a loss +function, and a negative sampling strategy. Initially, researchers focused on +enhancing performance by developing complex interaction modules. However, there +has been a recent shift toward refining loss functions and negative sampling +strategies. This shift has led to an increased interest in contrastive +learning, which pulls similar pairs closer while pushing dissimilar ones apart. +Contrastive learning may bring challenges like high memory demands and +under-utilization of some negative samples. The proposed Multi-Margin Cosine +Loss (MMCL) addresses these challenges by introducing multiple margins and +varying weights for negative samples. It efficiently utilizes not only the +hardest negatives but also other non-trivial negatives, and offers a simpler yet +effective loss function that outperforms more complex methods, especially when +resources are limited. Experiments on two well-known datasets demonstrated that +MMCL achieved up to a 20% performance improvement compared to a baseline loss +function when fewer negative samples are used. + +
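+
+ A hedged PyTorch sketch of what a multi-margin cosine loss with per-margin negative
+ weights could look like; the margin values, weights, and hinge formulation below are
+ illustrative guesses, not the exact definition of MMCL:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def multi_margin_cosine_loss(user_emb, pos_emb, neg_embs,
+                             margins=(0.9, 0.6, 0.3), weights=(1.0, 0.7, 0.4)):
+    """user_emb, pos_emb: (B, D); neg_embs: (B, N, D)."""
+    pos_sim = F.cosine_similarity(user_emb, pos_emb, dim=-1)                # (B,)
+    neg_sim = F.cosine_similarity(user_emb.unsqueeze(1), neg_embs, dim=-1)  # (B, N)
+    loss = torch.relu(1.0 - pos_sim).mean()  # pull positives toward similarity 1
+    for margin, weight in zip(margins, weights):
+        # Penalize negatives whose similarity comes within `margin` of the positive.
+        violation = torch.relu(neg_sim - (pos_sim.unsqueeze(1) - margin))
+        loss = loss + weight * violation.mean()
+    return loss
+
+user, pos, negs = torch.randn(8, 64), torch.randn(8, 64), torch.randn(8, 16, 64)
+print(multi_margin_cosine_loss(user, pos, negs))
+```
+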
+
+
+
+
+ + ♻ ☆ Zero-shot Audio Topic Reranking using Large Language Models + + +
+ Multimodal Video Search by Examples (MVSE) investigates using video clips as +the query term for information retrieval, rather than the more traditional text +query. This enables far richer search modalities such as images, speaker, +content, topic, and emotion. A key element for this process is highly rapid and +flexible search to support large archives, which in MVSE is facilitated by +representing video attributes with embeddings. This work aims to compensate for +any performance loss from this rapid archive search by examining reranking +approaches. In particular, zero-shot reranking methods using large language +models (LLMs) are investigated as these are applicable to any video archive +audio content. Performance is evaluated for topic-based retrieval on a publicly +available video archive, the BBC Rewind corpus. Results demonstrate that +reranking significantly improves retrieval ranking without requiring any +task-specific in-domain training data. Furthermore, three sources of +information (ASR transcriptions, automatic summaries and synopses) as input for +LLM reranking were compared. To gain a deeper understanding and further +insights into the performance differences and limitations of these text +sources, we employ a fact-checking approach to analyse the information +consistency among them. + +
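+
+ A schematic sketch of zero-shot LLM reranking over first-pass retrieval results;
+ `call_llm` is a hypothetical placeholder for whatever chat/completion API is used, and
+ the prompt wording, 0-10 scale, and `asr_transcript` field are assumptions:
+
+```python
+def call_llm(prompt: str) -> str:
+    """Hypothetical LLM call; replace with a real chat/completion API."""
+    raise NotImplementedError
+
+def rerank(query, candidates, text_field="asr_transcript"):
+    """Score each retrieved item for topical relevance with an LLM and re-sort."""
+    scored = []
+    for item in candidates:
+        prompt = (f"Query: {query}\n"
+                  f"Document: {item[text_field]}\n"
+                  "On a scale of 0-10, how topically relevant is the document to the "
+                  "query? Answer with a single number.")
+        try:
+            score = float(call_llm(prompt).strip())
+        except (NotImplementedError, ValueError):
+            score = 0.0  # a stable sort keeps the first-pass order if scoring fails
+        scored.append((score, item))
+    scored.sort(key=lambda pair: pair[0], reverse=True)
+    return [item for _, item in scored]
+```
+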
+
+
+
+
+ + ♻ ☆ A Counterfactual Explanation Framework for Retrieval Models + + +
+ Explainability has become a crucial concern in today's world, aiming to +enhance transparency in machine learning and deep learning models. Information +retrieval is no exception to this trend. In existing literature on +explainability of information retrieval, the emphasis has predominantly been on +illustrating the concept of relevance concerning a retrieval model. The +questions addressed include why a document is relevant to a query, why one +document exhibits higher relevance than another, or why a specific set of +documents is deemed relevant for a query. + However, limited attention has been given to understanding why a particular +document is considered non-relevant to a query with respect to a retrieval +model. In an effort to address this gap, our work focuses on the question of what +terms need to be added within a document to improve its ranking. This in turn +answers the question of which words played a role in not being favored by a +retrieval model for a particular query. We use an optimization framework to +solve the above-mentioned research problem. To the best of our knowledge, this +marks the first attempt to tackle this specific counterfactual problem. Our +experiments show the effectiveness of our proposed approach in predicting +counterfactuals for both statistical (e.g. BM25) and deep-learning-based models +(e.g. DRMM, DSSM, ColBERT). + +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Benchmarking Sub-Genre Classification For Mainstage Dance Music ICASSP 2025 + + +
+ Music classification, with a wide range of applications, is one of the most +prominent tasks in music information retrieval. To address the absence of +comprehensive datasets and high-performing methods in the classification of +mainstage dance music, this work introduces a novel benchmark comprising a new +dataset and a baseline. Our dataset extends the number of sub-genres to cover +most recent mainstage live sets by top DJs worldwide in music festivals. A +continuous soft labeling approach is employed to account for tracks that span +multiple sub-genres, preserving the inherent sophistication. For the baseline, +we developed deep learning models that outperform current state-of-the-art +multimodal language models, which struggle to identify house music sub-genres, +emphasizing the need for specialized models trained on fine-grained datasets. +Our benchmark can serve application scenarios such as music +recommendation, DJ set curation, and interactive multimedia, where we also +provide video demos. Our code is at +https://anonymous.4open.science/r/Mainstage-EDM-Benchmark/. + +
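+
+ One common way to realize the continuous soft labeling mentioned above is to train
+ against a label distribution rather than a single class; a small PyTorch sketch under
+ that assumption (the sub-genre names and mixing proportions are made up):
+
+```python
+import torch
+import torch.nn.functional as F
+
+SUBGENRES = ["big_room", "future_house", "melodic_dubstep", "techno"]
+
+# A track spanning two sub-genres receives fractional probability mass on both.
+soft_label = torch.tensor([[0.7, 0.3, 0.0, 0.0]])
+
+logits = torch.randn(1, len(SUBGENRES), requires_grad=True)  # model output for one track
+log_probs = F.log_softmax(logits, dim=-1)
+
+# Cross-entropy against a soft target: negative expected log-likelihood.
+loss = -(soft_label * log_probs).sum(dim=-1).mean()
+loss.backward()
+print(loss.item())
+```
+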
+
+ comment: Submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Distilling Generative-Discriminative Representations for Very + Low-Resolution Face Recognition + + +
+ Very low-resolution face recognition is challenging due to the serious loss +of informative facial details in resolution degradation. In this paper, we +propose a generative-discriminative representation distillation approach that +combines generative representation with cross-resolution aligned knowledge +distillation. This approach facilitates very low-resolution face recognition by +jointly distilling generative and discriminative models via two distillation +modules. Firstly, the generative representation distillation takes the encoder +of a diffusion model pretrained for face super-resolution as the generative +teacher to supervise the learning of the student backbone via feature +regression, and then freezes the student backbone. After that, the +discriminative representation distillation further considers a pretrained face +recognizer as the discriminative teacher to supervise the learning of the +student head via cross-resolution relational contrastive distillation. In this +way, the general backbone representation can be transformed into discriminative +head representation, leading to a robust and discriminative student model for +very low-resolution face recognition. Our approach improves the recovery of the +missing details in very low-resolution faces and achieves better knowledge +transfer. Extensive experiments on face datasets demonstrate that our approach +enhances the recognition accuracy of very low-resolution faces, showcasing its +effectiveness and adaptability. + +
+
+
+
+
+ + ☆ MIP-GAF: A MLLM-annotated Benchmark for Most Important Person + Localization and Group Context Understanding WACV 2025 + + +
+ Estimating the Most Important Person (MIP) in any social event setup is a +challenging problem mainly due to contextual complexity and scarcity of labeled +data. Moreover, the causality aspects of MIP estimation are quite subjective +and diverse. To this end, we aim to address the problem by annotating a +large-scale 'in-the-wild' dataset for identifying human perceptions about the +'Most Important Person (MIP)' in an image. The paper provides a thorough +description of our proposed Multimodal Large Language Model (MLLM) based data +annotation strategy, and a thorough data quality analysis. Further, we perform +a comprehensive benchmarking of the proposed dataset utilizing state-of-the-art +MIP localization methods, indicating a significant drop in performance compared +to existing datasets. The performance drop shows that the existing MIP +localization algorithms must be more robust with respect to 'in-the-wild' +situations. We believe the proposed dataset will play a vital role in building +the next-generation social situation understanding methods. The code and data +are available at https://github.com/surbhimadan92/MIP-GAF. + +
+
+ comment: Accepted for publication at WACV 2025 +
+
+
+
+
+ + ☆ Design and Implementation of Online Live Streaming System Using A 3D + Engine + + +
+ With the growing demand for live video streaming, there is an increasing need +for low-latency and high-quality transmission, especially with the advent of 5G +networks. While 5G offers hardware-level improvements, effective software +solutions for minimizing latency remain essential. Current methods, such as +multi-channel streaming, fail to address latency issues fundamentally, often +only adding new channels without optimizing overall performance. This thesis +proposes a novel approach using a 3D engine (e.g., Unity 3D) to stream +multi-input video data through a single channel with reduced latency. By +leveraging 3D engine capabilities, such as World/Screen Space Cameras, 3D +Canvases, and Webcam Textures, the proposed system consolidates video streams +from multiple external cameras into a unified, low-latency output. The +affiliated project of this thesis demonstrates the implementation of a +low-latency multi-channel live video streaming system. It employs the RTSP +protocol and examines video encoding techniques, alongside a client-side +application based on Unity 3D. The system architecture includes a WebSocket +server for persistent connections, an HTTP server for communication, a MySQL +database for storage, Redis for caching, and Nginx for load balancing. Each +module operates independently, ensuring flexibility and scalability in the +system's design. A key innovation of this system is its use of a 3D scene to +map multiple video inputs onto a virtual canvas, recorded by an in-engine +camera for transmission. This design minimizes redundant data, enabling an +efficient and director-guided live streaming network. The thesis concludes by +discussing challenges encountered during the project and provides solutions for +future improvement. + +
+
+
+
+
+ + ☆ Draw an Audio: Leveraging Multi-Instruction for Video-to-Audio Synthesis + + +
+ Foley is a term commonly used in filmmaking, referring to the addition of +daily sound effects to silent films or videos to enhance the auditory +experience. Video-to-Audio (V2A), as a particular type of automatic foley task, +presents inherent challenges related to audio-visual synchronization. These +challenges encompass maintaining the content consistency between the input +video and the generated audio, as well as the alignment of temporal and +loudness properties within the video. To address these issues, we construct a +controllable video-to-audio synthesis model, termed Draw an Audio, which +supports multiple input instructions through drawn masks and loudness signals. +To ensure content consistency between the synthesized audio and target video, +we introduce the Mask-Attention Module (MAM), which employs masked video +instruction to enable the model to focus on regions of interest. Additionally, +we implement the Time-Loudness Module (TLM), which uses an auxiliary loudness +signal to ensure the synthesis of sound that aligns with the video in both +loudness and temporal dimensions. Furthermore, we have extended a large-scale +V2A dataset, named VGGSound-Caption, by annotating caption prompts. Extensive +experiments on challenging benchmarks across two large-scale V2A datasets +verify Draw an Audio achieves the state-of-the-art. Project page: +https://yannqi.github.io/Draw-an-Audio/. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Do LLMs Understand Visual Anomalies? Uncovering LLM's Capabilities in + Zero-shot Anomaly Detection + + +
+ Large vision-language models (LVLMs) are markedly proficient in deriving +visual representations guided by natural language. Recent explorations have +utilized LVLMs to tackle zero-shot visual anomaly detection (VAD) challenges by +pairing images with textual descriptions indicative of normal and abnormal +conditions, referred to as anomaly prompts. However, existing approaches depend +on static anomaly prompts that are prone to cross-semantic ambiguity, and +prioritize global image-level representations over crucial local pixel-level +image-to-text alignment that is necessary for accurate anomaly localization. In +this paper, we present ALFA, a training-free approach designed to address these +challenges via a unified model. We propose a run-time prompt adaptation +strategy, which first generates informative anomaly prompts to leverage the +capabilities of a large language model (LLM). This strategy is enhanced by a +contextual scoring mechanism for per-image anomaly prompt adaptation and +cross-semantic ambiguity mitigation. We further introduce a novel fine-grained +aligner to fuse local pixel-level semantics for precise anomaly localization, +by projecting the image-text alignment from global to local semantic spaces. +Extensive evaluations on MVTec and VisA datasets confirm ALFA's effectiveness +in harnessing the language potential for zero-shot VAD, achieving significant +PRO improvements of 12.1% on MVTec and 8.9% on VisA compared to +state-of-the-art approaches. + +
+
+ comment: Accepted by MM'24 (Oral) +
+
+
+
+
+ + ♻ ☆ Question-Answering Dense Video Events + + +
+ Multimodal Large Language Models (MLLMs) have shown excellent performance in +question-answering of single-event videos. In this paper, we present +question-answering dense video events, a novel task that requires answering and +grounding the dense-event questions in long videos, thus challenging MLLMs to +faithfully comprehend and reason about multiple events occurring over extended +time periods. To facilitate the study, we construct DeVE-QA - a dataset +featuring 78K questions about 26K events on 10.6K long videos. We then +benchmark and show that existing MLLMs excelling at single-event QA struggle to +perform well in DeVE-QA. For improvement, we propose DeVi, a novel +training-free MLLM approach that highlights a hierarchical captioning module, a +temporal event memory module, and a self-consistency checking module to +respectively detect, contextualize and memorize, and ground dense-events in +long videos for question answering. Extensive experiments show that DeVi is +superior at answering dense-event questions and grounding relevant video +moments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1 +percent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA +respectively. + +
+
+
+
+
+ + ♻ ☆ FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video + Reconstruction in Resource and Timing Constrained Network Settings + + +
+ Despite the growing adoption of video processing via Internet of Things (IoT) +devices due to their cost-effectiveness, transmitting captured data to nearby +servers poses challenges due to varying timing constraints and scarcity of +network bandwidth. Existing video compression methods face difficulties in +recovering compressed data when incomplete data is provided. Here, we introduce +FrameCorr, a deep-learning based solution that utilizes previously received +data to predict the missing segments of a frame, enabling the reconstruction of +a frame from partially received data. + +
+
+
+
+
+ + ♻ ☆ Machine Learning-Driven Open-Source Framework for Assessing QoE in + Multimedia Networks + + +
+ The Internet is integral to modern life, influencing communication, business, +and lifestyles globally. As dependence on Internet services grows, the demand +for high-quality service delivery increases. Service providers must maintain +high standards of quality of service and quality of experience (QoE) to ensure +user satisfaction. QoE, which reflects user satisfaction with service quality, +is a key metric for multimedia services, yet it is challenging to measure due +to its subjective nature and the complexities of real-time feedback. This paper +introduces a machine learning-based framework for objectively assessing QoE in +multimedia networks. The open-source framework complies with the ITU-T P.1203 +standard. It automates data collection and user satisfaction prediction using +key network parameters such as delay, jitter, packet loss, bitrate, and +throughput. Using a dataset of over 20,000 records from various network +conditions, the Random Forest model predicts the mean opinion score with 95.8% +accuracy. Our framework addresses the limitations of existing QoE models by +integrating real-time data collection, machine learning predictions, and +adherence to international standards. This approach enhances QoE evaluation +accuracy and allows dynamic network resource management, optimizing performance +and cost-efficiency. Its open-source nature encourages adaptation and extension +for various multimedia services. The findings significantly affect the +telecommunications industry in managing and optimizing multimedia services. The +network centric QoE prediction of the framework offers a scalable solution to +improve user satisfaction without the need for content-specific data. Future +enhancements could include advanced machine learning models and broader +applicability to digital services. This research contributes a practical, +standardized tool for QoE assessment across diverse networks and platforms. + +
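+
+ A small sketch of the kind of model described above: a random forest mapping
+ network-level features to a mean opinion score. The synthetic data, feature ranges,
+ and hyperparameters are placeholders, not the framework's actual configuration:
+
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+
+rng = np.random.default_rng(0)
+n = 2000
+# Synthetic stand-ins for delay (ms), jitter (ms), packet loss (%), bitrate (kbps), throughput (Mbps).
+X = np.column_stack([rng.uniform(10, 300, n), rng.uniform(0, 50, n), rng.uniform(0, 5, n),
+                     rng.uniform(500, 8000, n), rng.uniform(1, 100, n)])
+# Toy MOS in [1, 5] that degrades with delay/jitter/loss and improves with bitrate.
+y = np.clip(5 - 0.005 * X[:, 0] - 0.01 * X[:, 1] - 0.4 * X[:, 2] + 0.0002 * X[:, 3], 1, 5)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+model = RandomForestRegressor(n_estimators=200, random_state=0)
+model.fit(X_train, y_train)
+print("R^2 on held-out data:", model.score(X_test, y_test))
+```
+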
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Benchmarking Chinese Knowledge Rectification in Large Language Models + + +
+ While Large Language Models (LLMs) exhibit remarkable generative +capabilities, they are not without flaws, particularly in the form of +hallucinations. This issue is even more pronounced when LLMs are applied to +specific languages and domains. For example, LLMs may generate nonsense +information when handling ancient Chinese poetry, proverbs, or idioms, owing to +the lack of specific knowledge. To this end, this paper introduces a benchmark +for rectifying Chinese knowledge in LLMs via knowledge editing. Specifically, +we introduce a new Chinese dataset, CKnowEdit, by collecting seven types of +knowledge from various sources, including classical texts, idioms, and content +from Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony, +antithesis, and logical constructs inherent in the Chinese language. Through +the analysis of this dataset, we uncover the challenges faced by current LLMs +in mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge +editing techniques on this dataset unveils the substantial scope for advancement +in the rectification of Chinese knowledge. Code and dataset are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work; code and dataset are available at + https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ OneEdit: A Neural-Symbolic Collaboratively Knowledge Editing System VLDB2024 + + +
+ Knowledge representation has been a central aim of AI since its inception. +Symbolic Knowledge Graphs (KGs) and neural Large Language Models (LLMs) can +both represent knowledge. KGs provide highly accurate and explicit knowledge +representation, but face scalability issues, while LLMs offer expansive coverage +of knowledge but incur significant training costs and struggle with precise +and reliable knowledge manipulation. To this end, we introduce OneEdit, a +neural-symbolic prototype system for collaborative knowledge editing using +natural language, which facilitates easy-to-use knowledge management with KG +and LLM. OneEdit consists of three modules: 1) The Interpreter handles user +interaction in natural language; 2) The Controller manages editing requests +from various users, leveraging the KG with rollbacks to handle knowledge +conflicts and prevent toxic knowledge attacks; 3) The Editor utilizes the +knowledge from the Controller to edit KG and LLM. We conduct experiments on two +new datasets with KGs, which demonstrate that OneEdit can achieve superior +performance. + +
+
+ comment: LLM+KG@VLDB2024, code is available at + https://github.com/zjunlp/OneEdit +
+
+
+
+
+ + ☆ Extracting the U.S. building types from OpenStreetMap data + + +
+ Building type information is crucial for population estimation, traffic +planning, urban planning, and emergency response applications. Although +essential, such data is often not readily available. To alleviate this problem, +this work creates a comprehensive dataset by providing +residential/non-residential building classification covering the entire United +States. We propose and utilize an unsupervised machine learning method to +classify building types based on building footprints and available +OpenStreetMap information. The classification result is validated using +authoritative ground truth data for select counties in the U.S. The validation +shows a high precision for non-residential building classification and a high +recall for residential buildings. We identified various approaches to improving +the quality of the classification, such as removing sheds and garages from the +dataset. Furthermore, analyzing the misclassifications revealed that they are +mainly due to missing and scarce metadata in OSM. A major result of this work +is the resulting dataset of classifying 67,705,475 buildings. We hope that this +data is of value to the scientific community, including urban and +transportation planners. + +
+
+
+
+
+ + ☆ RegNLP in Action: Facilitating Compliance Through Automated Information + Retrieval and Answer Generation + + +
+ Regulatory documents, issued by governmental regulatory bodies, establish +rules, guidelines, and standards that organizations must adhere to for legal +compliance. These documents, characterized by their length, complexity and +frequent updates, are challenging to interpret, requiring significant +allocation of time and expertise on the part of organizations to ensure ongoing +compliance. Regulatory Natural Language Processing (RegNLP) is a +multidisciplinary subfield aimed at simplifying access to and interpretation of +regulatory rules and obligations. We define an Automated Question-Passage +Generation task for RegNLP, create the ObliQA dataset containing 27,869 +questions derived from the Abu Dhabi Global Markets (ADGM) financial regulation +document collection, design a baseline Regulatory Information Retrieval and +Answer Generation system, and evaluate it with RePASs, a novel evaluation +metric that tests whether generated answers accurately capture all relevant +obligations and avoid contradictions. + +
+
+
+
+
+ + ☆ Enhancing Graph Contrastive Learning with Reliable and Informative + Augmentation for Recommendation + + +
+ Graph neural network (GNN) has been a powerful approach in collaborative +filtering (CF) due to its ability to model high-order user-item relationships. +Recently, to alleviate the data sparsity and enhance representation learning, +many efforts have been conducted to integrate contrastive learning (CL) with +GNNs. Despite the promising improvements, the contrastive view generation based +on structure and representation perturbations in existing methods potentially +disrupts the collaborative information in contrastive views, resulting in +limited effectiveness of positive alignment. To overcome this issue, we propose +CoGCL, a novel framework that aims to enhance graph contrastive learning by +constructing contrastive views with stronger collaborative information via +discrete codes. The core idea is to map users and items into discrete codes +rich in collaborative information for reliable and informative contrastive view +generation. To this end, we initially introduce a multi-level vector quantizer +in an end-to-end manner to quantize user and item representations into discrete +codes. Based on these discrete codes, we enhance the collaborative information +of contrastive views by considering neighborhood structure and semantic +relevance respectively. For neighborhood structure, we propose virtual neighbor +augmentation by treating discrete codes as virtual neighbors, which expands an +observed user-item interaction into multiple edges involving discrete codes. +Regarding semantic relevance, we identify similar users/items based on shared +discrete codes and interaction targets to generate the semantically relevant +view. Through these strategies, we construct contrastive views with stronger +collaborative information and develop a triple-view graph contrastive learning +approach. Extensive experiments on four public datasets demonstrate the +effectiveness of our proposed approach. + +
+
+
+
+
+ + ☆ End-to-End Learnable Item Tokenization for Generative Recommendation + + +
+ Recently, generative recommendation has emerged as a promising new paradigm +that directly generates item identifiers for recommendation. However, a key +challenge lies in how to effectively construct item identifiers that are +suitable for recommender systems. Existing methods typically decouple item +tokenization from subsequent generative recommendation training, likely +resulting in suboptimal performance. To address this limitation, we propose +ETEGRec, a novel End-To-End Generative Recommender by seamlessly integrating +item tokenization and generative recommendation. Our framework is developed +based on the dual encoder-decoder architecture, which consists of an item +tokenizer and a generative recommender. In order to achieve mutual enhancement +between the two components, we propose a recommendation-oriented alignment +approach by devising two specific optimization objectives: sequence-item +alignment and preference-semantic alignment. These two alignment objectives can +effectively couple the learning of item tokenizer and generative recommender, +thereby fostering the mutual enhancement between the two components. Finally, +we further devise an alternating optimization method, to facilitate stable and +effective end-to-end learning of the entire framework. Extensive experiments +demonstrate the effectiveness of our proposed framework compared to a series of +traditional sequential recommendation models and generative recommendation +baselines. + +
+
+
+
+
+ + ☆ DatAasee -- A Metadata-Lake as Metadata Catalog for a Virtual Data-Lake + + +
+ Metadata management for distributed data sources is a long-standing but +ever-growing problem. To counter this challenge in a research-data and +library-oriented setting, this work constructs a data architecture, derived +from the data-lake: the metadata-lake. A proof-of-concept implementation of +this proposed metadata system is presented and evaluated as well. + +
+
+
+
+
+ + ☆ Recommender Systems Algorithm Selection for Ranking Prediction on + Implicit Feedback Datasets + + +
+ The recommender systems algorithm selection problem for ranking prediction on +implicit feedback datasets is under-explored. Traditional approaches in +recommender systems algorithm selection focus predominantly on rating +prediction on explicit feedback datasets, leaving a research gap for ranking +prediction on implicit feedback datasets. Algorithm selection is a critical +challenge for nearly every practitioner in recommender systems. In this work, +we take the first steps toward addressing this research gap. We evaluate the +NDCG@10 of 24 recommender systems algorithms, each with two hyperparameter +configurations, on 72 recommender systems datasets. We train four optimized +machine-learning meta-models and one automated machine-learning meta-model with +three different settings on the resulting meta-dataset. Our results show that +the predictions of all tested meta-models exhibit a median Spearman correlation +ranging from 0.857 to 0.918 with the ground truth. We show that the median +Spearman correlation between meta-model predictions and the ground truth +increases by an average of 0.124 when the meta-model is optimized to predict +the ranking of algorithms instead of their performance. Furthermore, in terms +of predicting the best algorithm for an unknown dataset, we demonstrate that +the best optimized traditional meta-model, e.g., XGBoost, achieves a recall of +48.6%, outperforming the best tested automated machine learning meta-model, +e.g., AutoGluon, which achieves a recall of 47.2%. + +
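+
+ A brief sketch of the evaluation idea behind the numbers above: compare a meta-model's
+ predicted per-dataset algorithm scores against the ground-truth NDCG@10 ranking using
+ Spearman correlation; the algorithm names and scores below are fabricated placeholders:
+
+```python
+import numpy as np
+from scipy.stats import spearmanr
+
+algorithms = ["ItemKNN", "BPR", "LightGCN", "SASRec"]
+
+# Hypothetical ground-truth NDCG@10 and meta-model predictions for one dataset.
+true_ndcg = np.array([0.21, 0.27, 0.33, 0.30])
+predicted = np.array([0.20, 0.30, 0.31, 0.29])
+
+rho, _ = spearmanr(true_ndcg, predicted)
+print("Spearman correlation of predicted vs. true ranking:", round(float(rho), 3))
+
+# "Best algorithm" hit: did the meta-model put the true winner first?
+print("Top-1 hit:", algorithms[int(np.argmax(predicted))] == algorithms[int(np.argmax(true_ndcg))])
+```
+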
+
+ comment: Accepted for presentation at the 18th ACM Conference on Recommender + Systems in the Late-Breaking Results Track +
+
+
+
+
+ + ☆ Assessing SPARQL capabilities of Large Language Models + + +
+ The integration of Large Language Models (LLMs) with Knowledge Graphs (KGs) +offers significant synergistic potential for knowledge-driven applications. One +possible integration is the interpretation and generation of formal languages, +such as those used in the Semantic Web, with SPARQL being a core technology for +accessing KGs. In this paper, we focus on measuring out-of-the-box capabilities +of LLMs to work with SPARQL and more specifically with SPARQL SELECT queries, +applying a quantitative approach. + We implemented various benchmarking tasks in the LLM-KG-Bench framework for +automated execution and evaluation with several LLMs. The tasks assess +capabilities along the dimensions of syntax, semantic read, semantic create, +and the role of knowledge graph prompt inclusion. + With these new benchmarking tasks, we evaluated a selection of GPT, Gemini, +and Claude models. Our findings indicate that working with SPARQL SELECT +queries is still challenging for LLMs and heavily depends on the specific LLM +as well as the complexity of the task. While fixing basic syntax errors seems +to pose no problems for the best of the current LLMs evaluated, creating +semantically correct SPARQL SELECT queries is difficult in several cases. + +
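+
+ A minimal illustration of the syntax dimension mentioned above: checking whether an
+ LLM-generated SPARQL SELECT query even parses, here with rdflib; this is a generic
+ sketch, not the LLM-KG-Bench task implementation:
+
+```python
+from rdflib.plugins.sparql import prepareQuery
+
+def is_valid_sparql(query_text: str) -> bool:
+    """Return True if the query parses; semantic correctness is a separate question."""
+    try:
+        prepareQuery(query_text)
+        return True
+    except Exception:
+        return False
+
+generated = """
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+SELECT ?name WHERE { ?person foaf:name ?name . }
+"""
+print(is_valid_sparql(generated))                    # True
+print(is_valid_sparql("SELECT ?x WHERE { ?x ?y }"))  # False: malformed triple pattern
+```
+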
+
+ comment: peer reviewed publication at NLP4KGc @ Semantics 2024, see + https://sites.google.com/view/3rdnlp4kgc +
+
+
+
+
+ + ☆ Replicability Measures for Longitudinal Information Retrieval Evaluation + + +
+ Information Retrieval (IR) systems are exposed to constant changes in most +components. Documents are created, updated, or deleted, the information needs +are changing, and even relevance might not be static. While it is generally +expected that the IR systems retain a consistent utility for the users, test +collection evaluations rely on a fixed experimental setup. Based on the +LongEval shared task and test collection, this work explores how the +effectiveness measured in evolving experiments can be assessed. Specifically, +the persistency of effectiveness is investigated as a replicability task. It is +observed how the effectiveness progressively deteriorates over time compared to +the initial measurement. Employing adapted replicability measures provides +further insight into the persistence of effectiveness. The ranking of systems +varies across retrieval measures and time. In conclusion, it was found that the +most effective systems are not necessarily the ones with the most persistent +performance. + +
+
+ comment: Experimental IR Meets Multilinguality, Multimodality, and Interaction + - 15th International Conference of the CLEF Association, CLEF 2024, Grenoble, + France, September 9-12, 2024, Proceedings. arXiv admin note: text overlap + with arXiv:2308.10549 +
+
+
+
+
+ + ☆ NLLB-E5: A Scalable Multilingual Retrieval Model + + +
+ Despite significant progress in multilingual information retrieval, the lack +of models capable of effectively supporting multiple languages, particularly +low-resource ones like Indic languages, remains a critical challenge. This paper +presents NLLB-E5: A Scalable Multilingual Retrieval Model. NLLB-E5 leverages +the in-built multilingual capabilities in the NLLB encoder for translation +tasks. We propose a distillation approach from the multilingual retriever E5 to +provide a zero-shot retrieval approach handling multiple languages, including +all major Indic languages, without requiring multilingual training data. We +evaluate the model on a comprehensive suite of existing benchmarks, including +Hindi-BEIR, highlighting its robust performance across diverse languages and +tasks. Our findings uncover task- and domain-specific challenges, providing +valuable insights into the retrieval performance, especially for low-resource +languages. NLLB-E5 addresses the urgent need for an inclusive, scalable, and +language-agnostic text retrieval model, advancing the field of multilingual +information access and promoting digital inclusivity for millions of users +globally. + +
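+
+ A hedged sketch of the distillation objective such an approach might use: align the
+ student's (NLLB-encoder-based) sentence embeddings with those of a frozen multilingual
+ teacher (E5); the choice of MSE-plus-cosine loss and the embedding dimension are
+ assumptions:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def distillation_loss(student_emb, teacher_emb):
+    """student_emb, teacher_emb: (batch, dim) sentence embeddings; teacher is frozen."""
+    student_emb = F.normalize(student_emb, dim=-1)
+    teacher_emb = F.normalize(teacher_emb, dim=-1).detach()
+    mse = F.mse_loss(student_emb, teacher_emb)                                # pointwise match
+    cos = 1.0 - F.cosine_similarity(student_emb, teacher_emb, dim=-1).mean()  # directional match
+    return mse + cos
+
+student = torch.randn(4, 768, requires_grad=True)  # stand-in for NLLB-encoder outputs
+teacher = torch.randn(4, 768)                      # stand-in for E5 outputs
+loss = distillation_loss(student, teacher)
+loss.backward()
+print(loss.item())
+```
+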
+
+
+
+
+ + ♻ ☆ PhishLang: A Lightweight, Client-Side Phishing Detection Framework using + MobileBERT for Real-Time, Explainable Threat Mitigation + + +
+ In this paper, we introduce PhishLang, an open-source, lightweight language +model specifically designed for phishing website detection through contextual +analysis of the website. Unlike traditional heuristic or machine learning +models that rely on static features and struggle to adapt to new threats, and +deep learning models that are computationally intensive, our model leverages +MobileBERT, a fast and memory-efficient variant of the BERT architecture, to +learn granular features characteristic of phishing attacks. PhishLang operates +with minimal data preprocessing and offers performance comparable to leading +deep learning anti-phishing tools, while being significantly faster and less +resource-intensive. Over a 3.5-month testing period, PhishLang successfully +identified 25,796 phishing URLs, many of which were undetected by popular +antiphishing blocklists, thus demonstrating its potential to enhance current +detection measures. Capitalizing on PhishLang's resource efficiency, we release +the first open-source fully client-side Chromium browser extension that +provides inference locally without requiring to consult an online blocklist and +can be run on low-end systems with no impact on inference times. Our +implementation not only outperforms prevalent (server-side) phishing tools, but +is significantly more effective than the limited commercial client-side +measures available. Furthermore, we study how PhishLang can be integrated with +GPT-3.5 Turbo to create explainable blocklisting -- which, upon detection of a +website, provides users with detailed contextual information about the features +that led to a website being marked as phishing. + +
+
+
+
+
+ + ♻ ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis' utility through three case +studies: facilitating chatbot misuse research, visualizing and comparing topic +distributions across datasets, and characterizing user-specific conversation +patterns. WildVis is open-source and designed to be extendable, supporting +additional datasets and customized search and visualization functionalities. + +
+
+
+
+
+ + ♻ ☆ Understanding Fairness in Recommender Systems: A Healthcare Perspective + + +
+ Fairness in AI-driven decision-making systems has become a critical concern, +especially when these systems directly affect human lives. This paper explores +the public's comprehension of fairness in healthcare recommendations. We +conducted a survey where participants selected from four fairness metrics -- +Demographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive +Value -- across different healthcare scenarios to assess their understanding of +these concepts. Our findings reveal that fairness is a complex and often +misunderstood concept, with a generally low level of public understanding +regarding fairness metrics in recommender systems. This study highlights the +need for enhanced information and education on algorithmic fairness to support +informed decision-making in using these systems. Furthermore, the results +suggest that a one-size-fits-all approach to fairness may be insufficient, +pointing to the importance of context-sensitive designs in developing equitable +AI systems. + +
+
+ comment: Accepted to the 18th ACM Conference on Recommender Systems +
+
+
+
+
+ + ♻ ☆ ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG + Capabilities + + +
+ In this work, we introduce ChatQA 2, a Llama 3.0-based model with a 128K +context window, designed to bridge the gap between open-source LLMs and leading +proprietary models (e.g., GPT-4-Turbo) in long-context understanding and +retrieval-augmented generation (RAG) capabilities. These two capabilities are +essential for LLMs to process large volumes of information that cannot fit into +a single prompt and are complementary to each other, depending on the +downstream tasks and computational budgets. We present a detailed continued +training recipe to extend the context window of Llama3-70B-base from 8K to 128K +tokens, along with a three-stage instruction tuning process to enhance the +model's instruction-following, RAG performance, and long-context understanding +capabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model +outperforms most existing state-of-the-art models, including +GPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on +ultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only +a 4K context window, showing the strong long-context capability across varying +sequence lengths. We further provide extensive comparisons between direct +long-context and RAG solutions using the same state-of-the-art long-context +LLMs. Interestingly, we find that the performance of strong long-context LLMs +using RAG improves when retrieving a larger number of chunks. With a large set +of top-k chunks, RAG consistently outperforms the direct long-context solution +using the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B +and Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To +advance research in this field, we open-sourced the model weights, training +data, and the evaluation setup for the community: +https://chatqa2-project.github.io/ + +
+
+ comment: v2: major update with significantly improved results +
+
+
+
+
+
+
+
+ + Multimedia 13 + +
+
+
+ + ☆ REVISION: A Roadmap on Adaptive Video Streaming Optimization + + +
+ Due to the soaring popularity of video applications and the consequent rise +in video traffic on the Internet, technologies like HTTP Adaptive Streaming +(HAS) are crucial for delivering high Quality of Experience (QoE) to consumers. +HAS technology enables video players on consumer devices to enhance viewer +engagement by dynamically adapting video content quality based on network +conditions. This is especially relevant for consumer electronics as it ensures +an optimized viewing experience across a variety of devices, from smartphones +to smart TVs. This paper introduces REVISION, an efficient roadmap designed to +enhance adaptive video streaming, a core feature of modern consumer +electronics. The REVISION optimization triangle highlights three essential +aspects for improving streaming: Objective, Input Space, and Action Domain. +Additionally, REVISION proposes a novel layer-based architecture tailored to +refine video streaming systems, comprising Application, Control and Management, +and Resource layers. Each layer is designed to optimize different components of +the streaming process, which is directly linked to the performance and +efficiency of consumer devices. By adopting the principles of the REVISION, +manufacturers and developers can significantly improve the streaming +capabilities of consumer electronics, thereby enriching the consumer's +multimedia experience and accommodating the increasing demand for high-quality, +real-time video content. This approach addresses the complexities of today's +diverse video streaming ecosystem and paves the way for future advancements in +consumer technology. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ A CLIP-based siamese approach for meme classification + + +
+ Memes are an increasingly prevalent element of online discourse in social +networks, especially among young audiences. They carry ideas and messages that +range from humorous to hateful, and are widely consumed. Their potentially high +impact requires adequate means of control to moderate their use at large scale. +In this work, we propose SimCLIP, a deep learning-based architecture for +cross-modal understanding of memes, leveraging a pre-trained CLIP encoder to +produce context-aware embeddings and a Siamese fusion technique to capture the +interactions between text and image. We perform extensive experimentation on +seven meme classification tasks across six datasets. We establish a new state +of the art in Memotion7k with a 7.25% relative F1-score improvement, and +achieve super-human performance on Harm-P with a 13.73% F1-score improvement. Our +approach demonstrates the potential for compact meme classification models, +enabling accurate and efficient meme monitoring. We share our code at +https://github.com/jahuerta92/meme-classification-simclip + +
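+
+ A rough sketch of a Siamese-style fusion head operating on precomputed CLIP text and
+ image embeddings; the layer sizes, shared projection, and concatenation scheme are
+ assumptions about how such a fusion could look, not SimCLIP's published architecture:
+
+```python
+import torch
+import torch.nn as nn
+
+class SiameseFusionHead(nn.Module):
+    """Fuse CLIP text and image embeddings for meme classification."""
+    def __init__(self, clip_dim=512, hidden=256, num_classes=2):
+        super().__init__()
+        self.proj = nn.Linear(clip_dim, hidden)  # shared (Siamese) projection
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden * 3, hidden), nn.ReLU(),
+            nn.Linear(hidden, num_classes),
+        )
+
+    def forward(self, text_emb, image_emb):
+        t, v = self.proj(text_emb), self.proj(image_emb)
+        fused = torch.cat([t, v, t * v], dim=-1)  # both branches plus their interaction
+        return self.classifier(fused)
+
+head = SiameseFusionHead()
+print(head(torch.randn(8, 512), torch.randn(8, 512)).shape)  # torch.Size([8, 2])
+```
+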
+
+
+
+
+ + ☆ A Toolkit for Joint Speaker Diarization and Identification with + Application to Speaker-Attributed ASR + + +
+ We present a modular toolkit to perform joint speaker diarization and speaker +identification. The toolkit can leverage multiple models and algorithms, +which are defined in a configuration file. Such flexibility allows our system +to work properly in various conditions (e.g., multiple registered speakers' +sets, acoustic conditions and languages) and across application domains (e.g. +media monitoring, institutional, speech analytics). In this demonstration we +show a practical use-case in which speaker-related information is used jointly +with automatic speech recognition engines to generate speaker-attributed +transcriptions. To achieve that, we employ a user-friendly web-based interface +to process audio and video inputs with the chosen configuration. + +
+
+ comment: Show and Tell paper. Presented at Interspeech 2024 +
+
+
+
+
+ + ☆ Audio-Visual Speaker Diarization: Current Databases, Approaches and + Challenges + + +
+ Nowadays, the large amount of audio-visual content available has fostered the +need to develop new robust automatic speaker diarization systems to analyse and +characterise it. This kind of system helps to reduce the cost of doing this +process manually and allows the use of the speaker information for different +applications, as a huge quantity of information is present, for example, images +of faces, or audio recordings. Therefore, this paper aims to address a critical +area in the field of speaker diarization systems, the integration of +audio-visual content of different domains. This paper seeks to push beyond +current state-of-the-art practices by developing a robust audio-visual speaker +diarization framework adaptable to various data domains, including TV +scenarios, meetings, and daily activities. Unlike most of the existing +audio-visual speaker diarization systems, this framework will also include the +proposal of an approach to lead the precise assignment of specific identities +in TV scenarios where celebrities appear. In addition, in this work, we have +conducted an extensive compilation of the current state-of-the-art approaches +and the existing databases for developing audio-visual speaker diarization. + +
+
+
+
+
+ + ☆ Exploring Rich Subjective Quality Information for Image Quality + Assessment in the Wild + + +
+ Traditional in the wild image quality assessment (IQA) models are generally +trained with the quality labels of mean opinion score (MOS), while missing the +rich subjective quality information contained in the quality ratings, for +example, the standard deviation of opinion scores (SOS) or even distribution of +opinion scores (DOS). In this paper, we propose a novel IQA method named +RichIQA to explore the rich subjective rating information beyond MOS to predict +image quality in the wild. RichIQA is characterized by two key novel designs: +(1) a three-stage image quality prediction network which exploits the powerful +feature representation capability of the Convolutional vision Transformer (CvT) +and mimics the short-term and long-term memory mechanisms of human brain; (2) a +multi-label training strategy in which rich subjective quality information like +MOS, SOS and DOS are concurrently used to train the quality prediction network. +Powered by these two novel designs, RichIQA is able to predict the image +quality in terms of a distribution, from which the mean image quality can be +subsequently obtained. Extensive experimental results verify that the +three-stage network is tailored to predict rich quality information, while the +multi-label training strategy can fully exploit the potentials within +subjective quality rating and enhance the prediction performance and +generalizability of the network. RichIQA outperforms state-of-the-art +competitors on multiple large-scale in the wild IQA databases with rich +subjective rating labels. The code of RichIQA will be made publicly available +on GitHub. + +
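+
+ A toy sketch of the multi-label idea described above: predict a distribution of
+ opinion scores (DOS) and supervise it jointly with the MOS and SOS implied by that
+ distribution; the five-point scale, loss terms, and equal weighting are assumptions:
+
+```python
+import torch
+import torch.nn.functional as F
+
+ratings = torch.arange(1.0, 6.0)                             # 5-point opinion scale
+target_dos = torch.tensor([[0.05, 0.15, 0.40, 0.30, 0.10]])  # ground-truth distribution
+target_mos = (target_dos * ratings).sum(dim=-1)
+target_sos = ((target_dos * (ratings - target_mos.unsqueeze(-1)) ** 2).sum(dim=-1)).sqrt()
+
+logits = torch.randn(1, 5, requires_grad=True)               # model output for one image
+pred_dos = torch.softmax(logits, dim=-1)
+pred_mos = (pred_dos * ratings).sum(dim=-1)
+pred_sos = ((pred_dos * (ratings - pred_mos.unsqueeze(-1)) ** 2).sum(dim=-1)).sqrt()
+
+loss = (F.kl_div(pred_dos.log(), target_dos, reduction="batchmean")  # DOS term
+        + F.mse_loss(pred_mos, target_mos)                           # MOS term
+        + F.mse_loss(pred_sos, target_sos))                          # SOS term
+loss.backward()
+print(loss.item())
+```
+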
+
+
+
+
+ + ☆ Educational Virtual Field Trips based on Social VR and 360° Spaces + + +
+ Virtual field trips (VFTs) have proven to be valuable learning tools. Such +applications are mostly based on 360° technology and are to be +characterized as single-user applications in technological terms. In contrast, +Social VR applications are characterized by multi-user capability and +user-specific avatars. From a learning perspective, the concepts of +collaborative learning and embodiment have long been proposed as conducive to +learning. Both concepts might be supported using Social VR. However, little is +currently known about the use of Social VR for VFTs. Accordingly, the research +questions are to what extent VFTs can be implemented in Social VR environments +and how these Social VR-based VFTs are perceived by learners. This article +presents an evaluation study on the development and evaluation of a VFT +environment using the Social VR platform Mozilla Hubs. It describes the design +decisions to create the environment and evaluation results from a mixed-method +study (N=16) using a questionnaire and focus group discussions. The study +highlighted the opportunities offered by Social VR-based VFTs but also revealed +several challenges that need to be addressed to embrace the potential of Social +VR-based VFTs to be utilized regularly in education. + +
+
+ comment: 9 pages, 7 figures, 1 table, submitted to Games and Learning Alliance + Conference +
+
+
+
+
+ + ☆ Look One and More: Distilling Hybrid Order Relational Knowledge for + Cross-Resolution Image Recognition AAAI 2020 + + +
+ In spite of great success in many image recognition tasks achieved by recent +deep models, directly applying them to recognize low-resolution images may +suffer from low accuracy due to the loss of informative details during +resolution degradation. However, these images are still recognizable for +subjects who are familiar with the corresponding high-resolution ones. Inspired +by that, we propose a teacher-student learning approach to facilitate +low-resolution image recognition via hybrid order relational knowledge +distillation. The approach comprises three streams: the teacher stream is +pretrained to recognize high-resolution images with high accuracy, the student +stream is learned to identify low-resolution images by mimicking the teacher's +behaviors, and the extra assistant stream is introduced as a bridge to help +knowledge transfer from the teacher to the student. To extract sufficient +knowledge for reducing the loss in accuracy, the learning of the student is +supervised with multiple losses, which preserves the similarities in various +order relational structures. In this way, the capability of recovering missing +details of familiar low-resolution images can be effectively enhanced, leading +to a better knowledge transfer. Extensive experiments on metric learning, +low-resolution image classification and low-resolution face recognition tasks +show the effectiveness of our approach, while using reduced models. + +
+
+ comment: Accepted by AAAI 2020 +
+
+
+
+
+ + ☆ KAN-Based Fusion of Dual-Domain for Audio-Driven Facial Landmarks + Generation + + +
+ Audio-driven talking face generation is a widely researched topic due to its +high applicability. Reconstructing a talking face using audio significantly +contributes to fields such as education, healthcare, online conversations, +virtual assistants, and virtual reality. Early studies often focused solely on +changing the mouth movements, which resulted in outcomes with limited practical +applications. Recently, researchers have proposed a new approach of +constructing the entire face, including face pose, neck, and shoulders. To +achieve this, they need to generate through landmarks. However, creating stable +landmarks that align well with the audio is a challenge. In this paper, we +propose the KFusion of Dual-Domain model, a robust model that generates +landmarks from audio. We separate the audio into two distinct domains to learn +emotional information and facial context, then use a fusion mechanism based on +the KAN model. Our model demonstrates high efficiency compared to recent +models. This will lay the groundwork for the development of the audio-driven +talking face generation problem in the future. + +
+
+
+
+
+ + ☆ Adaptive Offloading and Enhancement for Low-Light Video Analytics on + Mobile Devices + + +
+ In this paper, we explore adaptive offloading and enhancement strategies for +video analytics tasks on computing-constrained mobile devices in low-light +conditions. We observe that the accuracy of low-light video analytics varies +across different enhancement algorithms. The root cause could be the disparities +in the effectiveness of enhancement algorithms for feature extraction in +analytic models. Specifically, the difference in class activation maps (CAMs) +between enhanced and low-light frames demonstrates a positive correlation with +video analytics accuracy. Motivated by such observations, a novel enhancement +quality assessment method is proposed on CAMs to evaluate the effectiveness of +different enhancement algorithms for low-light videos. Then, we design a +multi-edge system, which adaptively offloads and enhances low-light video +analytics tasks from mobile devices. To achieve the trade-off between the +enhancement quality and the latency for all system-served mobile devices, we +propose a genetic-based scheduling algorithm, which can find a near-optimal +solution in a reasonable time to meet the latency requirement. Thereby, the +offloading strategies and the enhancement algorithms are properly selected +under the condition of limited end-edge bandwidth and edge computation +resources. Simulation experiments demonstrate the superiority of the proposed +system, improving accuracy by up to 20.83% compared to existing benchmarks. + +
+
+
+
+
+ + ♻ ☆ HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale + Space Using Wearable IMUs and LiDAR + + +
+ We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture +method, aimed at accurately and efficiently creating a dynamic digital world, +containing large-scale indoor-outdoor scenes, diverse human motions, rich +human-human interactions, and human-environment interactions. By utilizing +body-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human +motions in unconstrained space without the need for external devices and +pre-built maps. This affords great flexibility and accessibility for +human-centered interaction and 4D scene capturing in various environments. +Taking into account that IMUs can capture spatially unrestricted human poses +but are prone to drifting during long-term use, while LiDAR is stable for +global localization but rough for local positions and orientations, HiSC4D +employs a joint optimization method, harmonizing all sensors and utilizing +environment cues, yielding promising results for long-term capture in large +scenes. To promote research of egocentric human interaction in large scenes and +facilitate downstream tasks, we also present a dataset, containing 8 sequences +in 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D +human motions with SMPL annotations and dynamic scenes, 31k frames of cropped +human point clouds, and scene mesh of the environment. A variety of scenarios, +such as the basketball gym and commercial street, alongside challenging human +motions, such as daily greeting, one-on-one basketball playing, and tour +guiding, demonstrate the effectiveness and the generalization ability of +HiSC4D. The dataset and code will be published at +www.lidarhumanmotion.net/hisc4d for research purposes. + +
+
+ comment: 17 pages, 10 figures, Jornal +
+
+
+
+
+ + ♻ ☆ Auto-ACD: A Large-scale Dataset for Audio-Language Representation + Learning ACM MM 2024 + + +
+ Recently, the AI community has made significant strides in developing +powerful foundation models, driven by large-scale multimodal datasets. However, +for audio representation learning, existing datasets suffer from limitations in +the following aspects: insufficient volume, simplistic content, and arduous +collection procedures. To establish an audio dataset with high-quality +captions, we propose an innovative, automatic approach leveraging multimodal +inputs, such as video frames and audio streams. Specifically, we construct a +large-scale, high-quality, audio-language dataset, named Auto-ACD, +comprising over 1.5M audio-text pairs. We exploit a series of pre-trained +models or APIs to determine audio-visual synchronisation and to generate image +captions, object detections, or audio tags for specific videos. Subsequently, we +employ an LLM to paraphrase a congruent caption for each audio, guided by the +extracted multi-modality clues. To demonstrate the effectiveness of the +proposed dataset, we train widely used models on our dataset and show +performance improvement on various downstream tasks, for example, +audio-language retrieval, audio captioning, zero-shot classification. In +addition, we establish a novel benchmark with environmental information and +provide a benchmark for audio-text tasks. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Concept Conductor: Orchestrating Multiple Personalized Concepts in + Text-to-Image Synthesis + + +
+ The customization of text-to-image models has seen significant advancements, +yet generating multiple personalized concepts remains a challenging task. +Current methods struggle with attribute leakage and layout confusion when +handling multiple concepts, leading to reduced concept fidelity and semantic +consistency. In this work, we introduce a novel training-free framework, +Concept Conductor, designed to ensure visual fidelity and correct layout in +multi-concept customization. Concept Conductor isolates the sampling processes +of multiple custom models to prevent attribute leakage between different +concepts and corrects erroneous layouts through self-attention-based spatial +guidance. Additionally, we present a concept injection technique that employs +shape-aware masks to specify the generation area for each concept. This +technique injects the structure and appearance of personalized concepts through +feature fusion in the attention layers, ensuring harmony in the final image. +Extensive qualitative and quantitative experiments demonstrate that Concept +Conductor can consistently generate composite images with accurate layouts +while preserving the visual details of each concept. Compared to existing +baselines, Concept Conductor shows significant performance improvements. Our +method supports the combination of any number of concepts and maintains high +fidelity even when dealing with visually similar concepts. The code and models +are available at https://github.com/Nihukat/Concept-Conductor. + +
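+ One plausible way to realize the shape-aware concept injection described above
+(an interpretation for illustration, not the released Concept Conductor code) is
+to blend feature maps from per-concept models into the base model's features
+inside masked regions, as in the PyTorch sketch below.
+
+import torch
+
+def fuse_concept_features(base_feat, concept_feats, masks):
+    """Blend per-concept feature maps into the base features inside each
+    concept's shape-aware mask; uncovered regions keep the base features.
+    base_feat, concept_feats[i]: (B, C, H, W); masks[i]: (B, H, W) in [0, 1]."""
+    fused = base_feat.clone()
+    for feat, mask in zip(concept_feats, masks):
+        m = mask.unsqueeze(1)                       # (B, 1, H, W), broadcast over channels
+        fused = torch.where(m > 0.5, feat, fused)   # hard assignment inside the mask
+    return fused
+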
+
+ comment: Github Page: https://github.com/Nihukat/Concept-Conductor +
+
+
+
+
+ + ♻ ☆ 360VFI: A Dataset and Benchmark for Omnidirectional Video Frame + Interpolation + + +
+ Head-mounted 360° displays and portable 360° cameras have significantly
+progressed, providing viewers with a realistic and immersive experience.
+However, many omnidirectional videos have low frame rates that can lead to
+visual fatigue, and the prevailing plane frame interpolation methodologies are
+unsuitable for omnidirectional video interpolation because they are designed
+solely for traditional videos. This paper introduces the benchmark dataset,
+360VFI, for Omnidirectional Video Frame Interpolation. We present a practical
+implementation that introduces a distortion prior from omnidirectional video
+into the network to modulate distortions. Specifically, we propose a pyramid
+distortion-sensitive feature extractor that uses the unique characteristics of
+the equirectangular projection (ERP) format as prior information. Moreover, we
+devise a decoder that uses an affine transformation to further facilitate the
+synthesis of intermediate frames. 360VFI is the first dataset and benchmark that
+explores the challenge of Omnidirectional Video Frame Interpolation. Through our
+benchmark analysis, we present four different distortion condition scenes in the
+proposed 360VFI dataset to evaluate the challenges triggered by distortion
+during interpolation. In addition, experimental results demonstrate that
+Omnidirectional Video Interpolation can be effectively improved by modeling
+omnidirectional distortion.
+
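+ A distortion prior for ERP frames can be illustrated with the standard
+equirectangular property that horizontal stretching grows roughly as
+1/cos(latitude); the sketch below builds such a per-pixel prior map. This is one
+plausible form of the prior, assumed for illustration rather than taken from the
+360VFI implementation.
+
+import numpy as np
+
+def erp_distortion_prior(height: int, width: int) -> np.ndarray:
+    """Per-pixel horizontal stretching factor of an equirectangular frame:
+    rows near the poles are stretched by roughly 1 / cos(latitude)."""
+    lat = (np.arange(height) + 0.5) / height * np.pi - np.pi / 2   # latitude per row
+    stretch = 1.0 / np.maximum(np.cos(lat), 1e-3)                  # clamp near the poles
+    return np.repeat(stretch[:, None], width, axis=1)              # (height, width) map
+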
+
+ comment: This is a preprint version +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs + + +
+ Despite the recent advancements in Large Language Models (LLMs), which have +significantly enhanced the generative capabilities for various NLP tasks, LLMs +still face limitations in directly handling retrieval tasks. However, many +practical applications demand the seamless integration of both retrieval and +generation. This paper introduces a novel and efficient One-pass Generation and +retrieval framework (OneGen), designed to improve LLMs' performance on tasks +that require both generation and retrieval. The proposed framework bridges the +traditionally separate training approaches for generation and retrieval by +incorporating retrieval tokens generated autoregressively. This enables a +single LLM to handle both tasks simultaneously in a unified forward pass. We +conduct experiments on two distinct types of composite tasks, RAG and Entity +Linking, to validate the pluggability, effectiveness, and efficiency of OneGen +in training and inference. Furthermore, our results show that integrating +generation and retrieval within the same context preserves the generative +capabilities of LLMs while improving retrieval performance. To the best of our +knowledge, OneGen is the first to enable LLMs to conduct vector retrieval +during the generation. + +
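+ The core mechanism can be sketched roughly as follows (an illustrative reading
+of the description above, not the OneGen code): the hidden state at a special
+retrieval token, produced in the same forward pass that generates text, serves
+as a dense query against a precomputed document index.
+
+import torch
+import torch.nn.functional as F
+
+def retrieve_during_generation(hidden_states, token_ids, retrieval_token_id,
+                               doc_embeddings, top_k=5):
+    """Use the hidden state at the latest retrieval token (same forward pass as
+    generation) as a query against a document embedding index.
+    hidden_states: (T, d); token_ids: (T,); doc_embeddings: (N, d)."""
+    positions = (token_ids == retrieval_token_id).nonzero(as_tuple=True)[0]
+    if positions.numel() == 0:
+        return None                                    # no retrieval requested yet
+    query = F.normalize(hidden_states[positions[-1]], dim=-1)
+    docs = F.normalize(doc_embeddings, dim=-1)
+    scores = docs @ query                              # cosine similarity per document
+    return torch.topk(scores, k=min(top_k, docs.size(0))).indices
+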
+
+ comment: Work in progress; code is available at + https://github.com/zjunlp/OneGen +
+
+
+
+
+ + ☆ A Survey on Diffusion Models for Recommender Systems + + +
+ While traditional recommendation techniques have made significant strides in
+the past decades, they still suffer from limited generalization performance
+caused by factors like inadequate collaborative signals, weak latent
+representations, and noisy data. In response, diffusion models (DMs) have
+emerged as promising solutions for recommender systems due to their robust
+generative capabilities, solid theoretical foundations, and improved training
+stability. In this paper, we present the first comprehensive survey on diffusion
+models for recommendation, and draw a bird's-eye view from the perspective of
+the whole pipeline in real-world recommender systems. We systematically
+categorize existing research works into three primary domains: (1) diffusion for
+data engineering & encoding, focusing on data augmentation and representation
+enhancement; (2) diffusion as recommender models, employing diffusion models to
+directly estimate user preferences and rank items; and (3) diffusion for content
+presentation, utilizing diffusion models to generate personalized content such
+as fashion and advertisement creatives. Our taxonomy highlights the unique
+strengths of diffusion models in capturing complex data distributions and
+generating high-quality, diverse samples that closely align with user
+preferences. We also summarize the core characteristics of adapting diffusion
+models for recommendation and further identify key areas for future exploration,
+helping to establish a roadmap for researchers and practitioners seeking to
+advance recommender systems through the innovative application of diffusion
+models. To further facilitate the research community of recommender systems
+based on diffusion models, we actively maintain a GitHub repository for papers
+and other related resources in this rising direction at
+https://github.com/CHIANGEL/Awesome-Diffusion-for-RecSys.
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Sequential Recommendation via Adaptive Robust Attention with + Multi-dimensional Embeddings + + +
+ Sequential recommendation models have achieved state-of-the-art performance
+using the self-attention mechanism. It has since been found that moving beyond
+only using item ID and positional embeddings leads to a significant accuracy
+boost when predicting the next item. In recent literature, it was reported that
+a multi-dimensional kernel embedding with temporal contextual kernels, which
+captures users' diverse behavioral patterns, results in a substantial
+performance improvement. In this study, we further improve the sequential
+recommender model's robustness and generalization by introducing a mix-attention
+mechanism with a layer-wise noise injection (LNI) regularization. We refer to
+our proposed model as the adaptive robust sequential recommendation framework
+(ADRRec), and demonstrate through extensive experiments that our model
+outperforms existing self-attention architectures.
+
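+ Layer-wise noise injection itself is simple to sketch; the module below (a
+generic illustration with an assumed noise scale, not the ADRRec implementation)
+perturbs a layer's hidden states with Gaussian noise during training and is a
+no-op at inference time.
+
+import torch
+import torch.nn as nn
+
+class LayerwiseNoiseInjection(nn.Module):
+    """Add zero-mean Gaussian noise to a layer's hidden states during training
+    as a simple robustness regularizer; acts as the identity at eval time."""
+    def __init__(self, std: float = 0.01):
+        super().__init__()
+        self.std = std
+
+    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
+        if self.training and self.std > 0:
+            hidden = hidden + torch.randn_like(hidden) * self.std
+        return hidden
+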
+
+
+
+
+ + ♻ ☆ Keyword-driven Retrieval-Augmented Large Language Models for Cold-start + User Recommendations + + +
+ Recent advancements in Large Language Models (LLMs) have shown significant
+potential in enhancing recommender systems. However, addressing the cold-start
+recommendation problem, where users lack historical data, remains a considerable
+challenge. In this paper, we introduce KALM4Rec (Keyword-driven
+Retrieval-Augmented Large Language Models for Cold-start User Recommendations),
+a novel framework specifically designed to tackle this problem by requiring only
+a few input keywords from users in a practical scenario of cold-start user
+restaurant recommendations. KALM4Rec operates in two main stages: candidate
+retrieval and LLM-based candidate re-ranking. In the first stage, keyword-driven
+retrieval models are used to identify potential candidates, addressing LLMs'
+limitations in processing extensive tokens and reducing the risk of generating
+misleading information. In the second stage, we employ LLMs with various
+prompting strategies, including zero-shot and few-shot techniques, to re-rank
+these candidates by integrating multiple examples directly into the LLM prompts.
+Our evaluation, using a Yelp restaurant dataset with user reviews from three
+English-speaking cities, shows that our proposed framework significantly
+improves recommendation quality. Specifically, the integration of in-context
+instructions with LLMs for re-ranking markedly enhances the performance of the
+cold-start user recommender system.
+
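+ The two-stage flow can be sketched as follows, with a TF-IDF retriever standing
+in for the keyword-driven retrieval models and an injected llm_rerank callable
+standing in for the prompted LLM; this is an illustrative skeleton, not the
+KALM4Rec implementation.
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def recommend_cold_start(user_keywords, restaurant_texts, llm_rerank,
+                         top_n=20, final_k=5):
+    """Stage 1: keyword-driven retrieval of candidate restaurants.
+    Stage 2: LLM-based re-ranking via an injected llm_rerank(prompt) callable."""
+    vec = TfidfVectorizer().fit(restaurant_texts)
+    sims = cosine_similarity(vec.transform([" ".join(user_keywords)]),
+                             vec.transform(restaurant_texts))[0]
+    candidates = sims.argsort()[::-1][:top_n]          # most similar first
+    prompt = (f"User keywords: {user_keywords}\n"
+              + "\n".join(f"[{i}] {restaurant_texts[i][:200]}" for i in candidates)
+              + f"\nReturn the ids of the {final_k} best candidates, best first.")
+    return llm_rerank(prompt)
+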
+
+ comment: 10 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ DREAM: A Dual Representation Learning Model for Multimodal + Recommendation + + +
+ Multimodal recommendation focuses primarily on effectively exploiting both
+behavioral and multimodal information for the recommendation task. However, most
+existing models suffer from the following issues when fusing information from
+two different domains: (1) Previous works under-utilize modal information,
+extracting it only through direct concatenation, addition, or simple linear
+layers. (2) Previous works treat modal features as learnable embeddings, which
+causes the modal embeddings to gradually deviate from the original modal
+features during learning. We refer to this issue as Modal Information
+Forgetting. (3) Previous approaches fail to account for the significant
+differences in the distribution between behavior and modality, leading to the
+issue of representation misalignment. To address these challenges, this paper
+proposes a novel Dual REpresentAtion learning model for Multimodal
+Recommendation called DREAM. For sufficient information extraction, we introduce
+two separate lines, a Behavior Line and a Modal Line, in which a Modal-specific
+Encoder is applied to strengthen modal representations. To address the issue of
+Modal Information Forgetting, we introduce the Similarity Supervised Signal to
+constrain the modal representations. Additionally, we design a Behavior-Modal
+Alignment module to fuse the dual representations through Intra-Alignment and
+Inter-Alignment. Extensive experiments on three public datasets demonstrate that
+the proposed DREAM method achieves state-of-the-art (SOTA) results. The source
+code will be available upon acceptance.
+
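+ One plausible form of the Similarity Supervised Signal (an interpretation for
+illustration, not the DREAM code) is a cosine-similarity penalty that keeps the
+learnable modal embeddings close to the frozen original modal features, directly
+countering Modal Information Forgetting:
+
+import torch
+import torch.nn.functional as F
+
+def similarity_supervision_loss(modal_embedding: torch.Tensor,
+                                frozen_modal_feature: torch.Tensor) -> torch.Tensor:
+    """Penalize drift of learnable modal embeddings away from the original
+    (frozen) modal features by maximizing their cosine similarity."""
+    cos = F.cosine_similarity(modal_embedding, frozen_modal_feature.detach(), dim=-1)
+    return (1.0 - cos).mean()
+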
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Visual Grounding with Multi-modal Conditional Adaptation ACM MM 2024 + + +
+ Visual grounding is the task of locating objects specified by natural language
+expressions. Existing methods extend generic object detection frameworks to
+tackle this task. They typically extract visual and textual features separately
+using independent visual and textual encoders, then fuse these features in a
+multi-modal decoder for final prediction. However, visual grounding presents
+unique challenges. It often involves locating objects with different text
+descriptions within the same image. Existing methods struggle with this task
+because the independent visual encoder produces identical visual features for
+the same image, limiting detection performance. Some recent approaches propose
+various language-guided visual encoders to address this issue, but they mostly
+rely solely on textual information and require sophisticated designs. In this
+paper, we introduce Multi-modal Conditional Adaptation (MMCA), which enables the
+visual encoder to adaptively update weights, directing its focus towards
+text-relevant regions. Specifically, we first integrate information from
+different modalities to obtain multi-modal embeddings. Then we utilize a set of
+weighting coefficients, which are generated from the multi-modal embeddings, to
+reorganize the weight update matrices and apply them to the visual encoder of
+the visual grounding model. Extensive experiments on four widely used datasets
+demonstrate that MMCA achieves significant improvements and state-of-the-art
+results. Ablation experiments further demonstrate the lightweight design and
+efficiency of our method. Our source code is available at:
+https://github.com/Mr-Bigworth/MMCA.
+
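+ The conditional-adaptation idea can be sketched as a linear layer whose
+low-rank weight update is re-weighted per sample by coefficients predicted from
+the multi-modal embedding. The module below is an assumed, simplified reading of
+that mechanism (expert count, rank, and dimensions are placeholders), not the
+released MMCA code.
+
+import torch
+import torch.nn as nn
+
+class ConditionalLowRankAdapter(nn.Module):
+    """Linear layer whose low-rank weight update is re-weighted per sample by
+    coefficients predicted from a multi-modal embedding (sizes are placeholders)."""
+    def __init__(self, dim=256, rank=4, n_experts=4, cond_dim=512):
+        super().__init__()
+        self.base = nn.Linear(dim, dim)
+        self.A = nn.Parameter(torch.randn(n_experts, dim, rank) * 0.02)
+        self.B = nn.Parameter(torch.zeros(n_experts, rank, dim))
+        self.coef = nn.Linear(cond_dim, n_experts)
+
+    def forward(self, x, mm_embedding):
+        # x: (B, N, dim) visual tokens; mm_embedding: (B, cond_dim) fused clue
+        w = torch.softmax(self.coef(mm_embedding), dim=-1)            # (B, n_experts)
+        delta = torch.einsum('be,eir,erj->bij', w, self.A, self.B)    # (B, dim, dim)
+        return self.base(x) + torch.einsum('bni,bij->bnj', x, delta)
+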
+
+ comment: Accepted by ACM MM 2024 [Oral] +
+
+
+
+
+ + ♻ ☆ Estimating Indoor Scene Depth Maps from Ultrasonic Echoes ICIP 2024 + + +
+ Measuring 3D geometric structures of indoor scenes requires dedicated depth
+sensors, which are not always available. Echo-based depth estimation has
+recently been studied as a promising alternative solution. All previous studies
+have assumed the use of echoes in the audible range. However, one major problem
+is that audible echoes cannot be used in quiet spaces or other situations where
+producing audible sounds is prohibited. In this paper, we consider echo-based
+depth estimation using inaudible ultrasonic echoes. While ultrasonic waves
+provide high measurement accuracy in theory, the actual depth estimation
+accuracy when ultrasonic echoes are used has remained unclear, due to their
+sensitivity to noise and susceptibility to attenuation. We first investigate the
+depth estimation accuracy when the frequency of the sound source is restricted
+to the high-frequency band, and find that the accuracy decreases when the
+frequency is limited to the ultrasonic range. Based on this observation, we
+propose a novel deep learning method to improve the accuracy of ultrasonic
+echo-based depth estimation by using audible echoes as auxiliary data only
+during training. Experimental results with a public dataset demonstrate that our
+method improves the estimation accuracy.
+
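+ Using audible echoes only at training time is commonly realized as an auxiliary
+distillation term; the loss below (a generic sketch with an assumed weighting,
+not necessarily the paper's exact objective) combines depth regression with a
+term pulling ultrasonic-echo features toward audible-echo features.
+
+import torch.nn.functional as F
+
+def training_loss(depth_pred, depth_gt, feat_ultrasonic, feat_audible, alpha=0.5):
+    """Depth regression plus a distillation term that pulls ultrasonic-echo
+    features toward audible-echo features (available only during training)."""
+    depth_loss = F.l1_loss(depth_pred, depth_gt)
+    distill_loss = F.mse_loss(feat_ultrasonic, feat_audible.detach())
+    return depth_loss + alpha * distill_loss
+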
+
+ comment: ICIP 2024 +
+
+
+
+
+ + ♻ ☆ FakeBench: Probing Explainable Fake Image Detection via Large Multimodal + Models + + +
+ The ability to distinguish whether an image is generated by artificial +intelligence (AI) is a crucial ingredient in human intelligence, usually +accompanied by a complex and dialectical forensic and reasoning process. +However, current fake image detection models and databases focus on binary +classification without understandable explanations for the general populace. +This weakens the credibility of authenticity judgment and may conceal potential +model biases. Meanwhile, large multimodal models (LMMs) have exhibited immense +visual-text capabilities on various tasks, bringing the potential for +explainable fake image detection. Therefore, we pioneer the probe of LMMs for +explainable fake image detection by presenting a multimodal database +encompassing textual authenticity descriptions, the FakeBench. For +construction, we first introduce a fine-grained taxonomy of generative visual +forgery concerning human perception, based on which we collect forgery +descriptions in human natural language with a human-in-the-loop strategy. +FakeBench examines LMMs with four evaluation criteria: detection, reasoning, +interpretation and fine-grained forgery analysis, to obtain deeper insights +into image authenticity-relevant capabilities. Experiments on various LMMs +confirm their merits and demerits in different aspects of fake image detection +tasks. This research presents a paradigm shift towards transparency for the +fake image detection area and reveals the need for greater emphasis on forensic +elements in visual-language research and AI risk control. FakeBench will be +available at https://github.com/Yixuan423/FakeBench. + +
+
+
+
+
+ + ♻ ☆ DREAM: A Dual Representation Learning Model for Multimodal + Recommendation + + +
+ Multimodal recommendation focuses primarily on effectively exploiting both
+behavioral and multimodal information for the recommendation task. However, most
+existing models suffer from the following issues when fusing information from
+two different domains: (1) Previous works under-utilize modal information,
+extracting it only through direct concatenation, addition, or simple linear
+layers. (2) Previous works treat modal features as learnable embeddings, which
+causes the modal embeddings to gradually deviate from the original modal
+features during learning. We refer to this issue as Modal Information
+Forgetting. (3) Previous approaches fail to account for the significant
+differences in the distribution between behavior and modality, leading to the
+issue of representation misalignment. To address these challenges, this paper
+proposes a novel Dual REpresentAtion learning model for Multimodal
+Recommendation called DREAM. For sufficient information extraction, we introduce
+two separate lines, a Behavior Line and a Modal Line, in which a Modal-specific
+Encoder is applied to strengthen modal representations. To address the issue of
+Modal Information Forgetting, we introduce the Similarity Supervised Signal to
+constrain the modal representations. Additionally, we design a Behavior-Modal
+Alignment module to fuse the dual representations through Intra-Alignment and
+Inter-Alignment. Extensive experiments on three public datasets demonstrate that
+the proposed DREAM method achieves state-of-the-art (SOTA) results. The source
+code will be available upon acceptance.
+
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch between light and dark themes */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`